The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c784be435d5dae28d3b03db31753dd7a18733f0c Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com>
Date: Wed, 15 May 2019 13:15:52 +0530
Subject: [PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in
resize_hpt()
The calls to arch_add_memory()/arch_remove_memory() are always made
with the read-side cpu_hotplug_lock acquired via memory_hotplug_begin().
On pSeries, arch_add_memory()/arch_remove_memory() eventually call
resize_hpt() which in turn calls stop_machine() which acquires the
read-side cpu_hotplug_lock again, thereby resulting in the recursive
acquisition of this lock.
In the absence of CONFIG_PROVE_LOCKING, we hadn't observed a system
lockup during a memory hotplug operation because cpus_read_lock() is a
per-cpu rwsem read, which, in the fast-path (in the absence of the
writer, which in our case is a CPU-hotplug operation) simply
increments the read_count on the semaphore. Thus a recursive read in
the fast-path doesn't cause any problems.
However, we can hit this problem in practice if there is a concurrent
CPU-hotplug operation in progress which is waiting to acquire the
write-side of the lock. This causes the second recursive read to block
until the writer finishes, while the writer itself is blocked because
the first read still holds the lock. Thus neither the reader nor the
writer makes any progress, thereby blocking both CPU-hotplug and
memory-hotplug operations.
Memory-Hotplug                            CPU-Hotplug
   CPU 0                                     CPU 1
   ------                                    ------
1. down_read(cpu_hotplug_lock.rw_sem)
   [memory_hotplug_begin]
                                          2. down_write(cpu_hotplug_lock.rw_sem)
                                             [cpu_up/cpu_down]
3. down_read(cpu_hotplug_lock.rw_sem)
   [stop_machine()]
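The same deadlock can be sketched with the bare percpu-rwsem primitives
(an illustrative reduction only, not the actual call chain; the real
paths are the ones shown above):

  /* CPU0 */
  percpu_down_read(&cpu_hotplug_lock);    /* fast path: read_count++ */

          /* CPU1: percpu_down_write(&cpu_hotplug_lock) is now queued,
           * waiting for all readers to drain */

  /* CPU0 */
  percpu_down_read(&cpu_hotplug_lock);    /* slow path: blocks behind the
                                           * queued writer -> deadlock */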
Lockdep complains as follows in these code-paths.
swapper/0/1 is trying to acquire lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
but task is already holding lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(cpu_hotplug_lock.rw_sem);
lock(cpu_hotplug_lock.rw_sem);
*** DEADLOCK ***
May be due to missing lock nesting notation
3 locks held by swapper/0/1:
#0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0
#1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
#2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166
Call Trace:
dump_stack+0xe8/0x164 (unreliable)
__lock_acquire+0x1110/0x1c70
lock_acquire+0x240/0x290
cpus_read_lock+0x64/0xf0
stop_machine+0x2c/0x60
pseries_lpar_resize_hpt+0x19c/0x2c0
resize_hpt_for_hotplug+0x70/0xd0
arch_add_memory+0x58/0xfc
devm_memremap_pages+0x5e8/0x8f0
pmem_attach_disk+0x764/0x830
nvdimm_bus_probe+0x118/0x240
really_probe+0x230/0x4b0
driver_probe_device+0x16c/0x1e0
__driver_attach+0x148/0x1b0
bus_for_each_dev+0x90/0x130
driver_attach+0x34/0x50
bus_add_driver+0x1a8/0x360
driver_register+0x108/0x170
__nd_driver_register+0xd0/0xf0
nd_pmem_driver_init+0x34/0x48
do_one_initcall+0x1e0/0x45c
kernel_init_freeable+0x540/0x64c
kernel_init+0x2c/0x160
ret_from_kernel_thread+0x5c/0x68
Fix this issue by
1) Requiring all the calls to pseries_lpar_resize_hpt() be made
with cpu_hotplug_lock held.
2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked()
as a consequence of 1)
3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
with cpu_hotplug_lock held.
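Taken together, the calling convention introduced by this patch looks
roughly as follows (a minimal sketch; resize_hpt_example() is an
illustrative name, the real callers are hpt_order_set() and the memory
hotplug path, which already holds the lock via mem_hotplug_begin()):

  static int resize_hpt_example(u64 val)
  {
          int ret;

          if (!mmu_hash_ops.resize_hpt)
                  return -ENODEV;

          cpus_read_lock();                   /* read-side cpu_hotplug_lock */
          ret = mmu_hash_ops.resize_hpt(val); /* ends up in stop_machine_cpuslocked() */
          cpus_read_unlock();

          return ret;
  }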
Fixes: dbcf929c0062 ("powerpc/pseries: Add support for hash table resizing")
Cc: stable(a)vger.kernel.org # v4.11+
Reported-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/1557906352-29048-1-git-send-email-ego@linux.vnet.…
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e6d471058597..c363e850550e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -34,6 +34,7 @@
#include <linux/libfdt.h>
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
+#include <linux/cpu.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
@@ -1931,10 +1932,16 @@ static int hpt_order_get(void *data, u64 *val)
static int hpt_order_set(void *data, u64 val)
{
+ int ret;
+
if (!mmu_hash_ops.resize_hpt)
return -ENODEV;
- return mmu_hash_ops.resize_hpt(val);
+ cpus_read_lock();
+ ret = mmu_hash_ops.resize_hpt(val);
+ cpus_read_unlock();
+
+ return ret;
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..4f76e5f30c97 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
return 0;
}
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
struct hpt_resize_state state = {
@@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
t1 = ktime_get();
- rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
t2 = ktime_get();
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9e006f5fcf2bab59149cb38a48a4817c1b538b4 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi(a)redhat.com>
Date: Sun, 4 Aug 2019 14:10:06 -0500
Subject: [PATCH] nbd: fix max number of supported devs
This fixes a bug added in 4.10 with commit:
commit 9561a7ade0c205bc2ee035a2ac880478dcc1a024
Author: Josef Bacik <jbacik(a)fb.com>
Date: Tue Nov 22 14:04:40 2016 -0500
nbd: add multi-connection support
that limited the number of devices to 256. Before that patch we could
create thousands of devices, but it switched us from using our own
thread to using a workqueue, which has a default limit of 256
active works.
The problem is that our recv_work function sits in a loop until
disconnection, but only handles IO for one connection. The work is
started when the connection is started/restarted, but if we end up
creating 257 or more connections, the queue_work call just queues
connection 257+'s recv_work, which then has to wait for one of
connection 1-256's recv_work items to be disconnected and for that
work instance to complete.
Instead of reverting to kthreads, this has us allocate a
workqueue_struct per device, so we can block in the work.
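The per-device pattern boils down to the following (a condensed sketch
of the hunks below; error paths and locking omitted):

  /* nbd_start_device(): one workqueue per device, so each recv_work can
   * block for the lifetime of its connection */
  nbd->recv_workq = alloc_workqueue("knbd%d-recv",
                                    WQ_MEM_RECLAIM | WQ_HIGHPRI |
                                    WQ_UNBOUND, 0, nbd->index);
  if (!nbd->recv_workq)
          return -ENOMEM;

  /* one work item queued per connection */
  queue_work(nbd->recv_workq, &args->work);

  /* nbd_config_put(): the queue dies with the device config */
  if (nbd->recv_workq)
          destroy_workqueue(nbd->recv_workq);
  nbd->recv_workq = NULL;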
Cc: stable(a)vger.kernel.org
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Mike Christie <mchristi(a)redhat.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 98c618e5732c..a8e3815295fe 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -108,6 +108,7 @@ struct nbd_device {
struct nbd_config *config;
struct mutex config_lock;
struct gendisk *disk;
+ struct workqueue_struct *recv_workq;
struct list_head list;
struct task_struct *task_recv;
@@ -139,7 +140,6 @@ static struct dentry *nbd_dbg_dir;
static unsigned int nbds_max = 16;
static int max_part = 16;
-static struct workqueue_struct *recv_workqueue;
static int part_shift;
static int nbd_dev_dbg_init(struct nbd_device *nbd);
@@ -1058,7 +1058,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
/* We take the tx_mutex in an error path in the recv_work, so we
* need to queue_work outside of the tx_mutex.
*/
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
atomic_inc(&config->live_connections);
wake_up(&config->conn_wait);
@@ -1159,6 +1159,10 @@ static void nbd_config_put(struct nbd_device *nbd)
kfree(nbd->config);
nbd->config = NULL;
+ if (nbd->recv_workq)
+ destroy_workqueue(nbd->recv_workq);
+ nbd->recv_workq = NULL;
+
nbd->tag_set.timeout = 0;
nbd->disk->queue->limits.discard_granularity = 0;
nbd->disk->queue->limits.discard_alignment = 0;
@@ -1187,6 +1191,14 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
+ nbd->recv_workq = alloc_workqueue("knbd%d-recv",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_UNBOUND, 0, nbd->index);
+ if (!nbd->recv_workq) {
+ dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
+ return -ENOMEM;
+ }
+
blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
nbd->task_recv = current;
@@ -1217,7 +1229,7 @@ static int nbd_start_device(struct nbd_device *nbd)
INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
args->index = i;
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
}
nbd_size_update(nbd);
return error;
@@ -1237,8 +1249,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0);
- if (ret)
+ if (ret) {
sock_shutdown(nbd);
+ flush_workqueue(nbd->recv_workq);
+ }
mutex_lock(&nbd->config_lock);
nbd_bdev_reset(bdev);
/* user requested, ignore socket errors */
@@ -1899,6 +1913,12 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
nbd_disconnect(nbd);
nbd_clear_sock(nbd);
mutex_unlock(&nbd->config_lock);
+ /*
+ * Make sure recv thread has finished, so it does not drop the last
+ * config ref and try to destroy the workqueue from inside the work
+ * queue.
+ */
+ flush_workqueue(nbd->recv_workq);
if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
@@ -2283,20 +2303,12 @@ static int __init nbd_init(void)
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
- recv_workqueue = alloc_workqueue("knbd-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI |
- WQ_UNBOUND, 0);
- if (!recv_workqueue)
- return -ENOMEM;
- if (register_blkdev(NBD_MAJOR, "nbd")) {
- destroy_workqueue(recv_workqueue);
+ if (register_blkdev(NBD_MAJOR, "nbd"))
return -EIO;
- }
if (genl_register_family(&nbd_genl_family)) {
unregister_blkdev(NBD_MAJOR, "nbd");
- destroy_workqueue(recv_workqueue);
return -EINVAL;
}
nbd_dbg_init();
@@ -2338,7 +2350,6 @@ static void __exit nbd_cleanup(void)
idr_destroy(&nbd_index_idr);
genl_unregister_family(&nbd_genl_family);
- destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
}
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: c49a0a80137c7ca7d6ced4c812c9e07a949f6f24
Gitweb: https://git.kernel.org/tip/c49a0a80137c7ca7d6ced4c812c9e07a949f6f24
Author: Tom Lendacky <thomas.lendacky(a)amd.com>
AuthorDate: Mon, 19 Aug 2019 15:52:35
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Mon, 19 Aug 2019 19:42:52 +02:00
x86/CPU/AMD: Clear RDRAND CPUID bit on AMD family 15h/16h
There have been reports of RDRAND issues after resuming from suspend on
some AMD family 15h and family 16h systems. This issue stems from a BIOS
not performing the proper steps during resume to ensure RDRAND continues
to function properly.
RDRAND support is indicated by CPUID Fn00000001_ECX[30]. This bit can be
reset by clearing MSR C001_1004[62]. Any software that checks for RDRAND
support using CPUID, including the kernel, will believe that RDRAND is
not supported.
Update the CPU initialization to clear the RDRAND CPUID bit for any family
15h and 16h processor that supports RDRAND. If it is known that the family
15h or family 16h system does not have an RDRAND resume issue or that the
system will not be placed in suspend, the "rdrand=force" kernel parameter
can be used to stop the clearing of the RDRAND CPUID bit.
Additionally, update the suspend and resume path to save and restore the
MSR C001_1004 value to ensure that the RDRAND CPUID setting remains in
place after resuming from suspend.
Note that clearing the RDRAND CPUID bit does not prevent a processor
that normally supports the RDRAND instruction from executing it. So any
code that determined RDRAND support based on family and model won't #UD.
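The CPUID-bit handling itself is small; condensed from the
clear_rdrand_cpuid_bit() logic in the patch, it amounts to:

  /* Hide RDRAND by clearing MSR C001_1004[62], then re-check CPUID in
   * case a hypervisor does not honour the MSR write. */
  msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);

  if (cpuid_ecx(1) & BIT(30))     /* CPUID Fn0000_0001_ECX[30] still set */
          return;                 /* hypervisor kept RDRAND visible */

  clear_cpu_cap(c, X86_FEATURE_RDRAND);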
Signed-off-by: Tom Lendacky <thomas.lendacky(a)amd.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: Andrew Cooper <andrew.cooper3(a)citrix.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Chen Yu <yu.c.chen(a)intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: "linux-doc(a)vger.kernel.org" <linux-doc(a)vger.kernel.org>
Cc: "linux-pm(a)vger.kernel.org" <linux-pm(a)vger.kernel.org>
Cc: Nathan Chancellor <natechancellor(a)gmail.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Pavel Machek <pavel(a)ucw.cz>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: <stable(a)vger.kernel.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "x86(a)kernel.org" <x86(a)kernel.org>
Link: https://lkml.kernel.org/r/7543af91666f491547bd86cebb1e17c66824ab9f.15662299…
---
Documentation/admin-guide/kernel-parameters.txt | 7 +-
arch/x86/include/asm/msr-index.h | 1 +-
arch/x86/kernel/cpu/amd.c | 66 +------------
arch/x86/power/cpu.c | 86 ++--------------
4 files changed, 13 insertions(+), 147 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4c19719..47d981a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4090,13 +4090,6 @@
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
- rdrand= [X86]
- force - Override the decision by the kernel to hide the
- advertisement of RDRAND support (this affects
- certain AMD processors because of buggy BIOS
- support, specifically around the suspend/resume
- path).
-
rdt= [HW,X86,RDT]
Turn on/off individual RDT features. List is:
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 271d837..6b4fc27 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -381,7 +381,6 @@
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
-#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS 0xc0010141
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 68c363c..8d4e504 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -804,64 +804,6 @@ static void init_amd_ln(struct cpuinfo_x86 *c)
msr_set_bit(MSR_AMD64_DE_CFG, 31);
}
-static bool rdrand_force;
-
-static int __init rdrand_cmdline(char *str)
-{
- if (!str)
- return -EINVAL;
-
- if (!strcmp(str, "force"))
- rdrand_force = true;
- else
- return -EINVAL;
-
- return 0;
-}
-early_param("rdrand", rdrand_cmdline);
-
-static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
-{
- /*
- * Saving of the MSR used to hide the RDRAND support during
- * suspend/resume is done by arch/x86/power/cpu.c, which is
- * dependent on CONFIG_PM_SLEEP.
- */
- if (!IS_ENABLED(CONFIG_PM_SLEEP))
- return;
-
- /*
- * The nordrand option can clear X86_FEATURE_RDRAND, so check for
- * RDRAND support using the CPUID function directly.
- */
- if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
- return;
-
- msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
-
- /*
- * Verify that the CPUID change has occurred in case the kernel is
- * running virtualized and the hypervisor doesn't support the MSR.
- */
- if (cpuid_ecx(1) & BIT(30)) {
- pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
- return;
- }
-
- clear_cpu_cap(c, X86_FEATURE_RDRAND);
- pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
-}
-
-static void init_amd_jg(struct cpuinfo_x86 *c)
-{
- /*
- * Some BIOS implementations do not restore proper RDRAND support
- * across suspend and resume. Check on whether to hide the RDRAND
- * instruction support via CPUID.
- */
- clear_rdrand_cpuid_bit(c);
-}
-
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
@@ -876,13 +818,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
-
- /*
- * Some BIOS implementations do not restore proper RDRAND support
- * across suspend and resume. Check on whether to hide the RDRAND
- * instruction support via CPUID.
- */
- clear_rdrand_cpuid_bit(c);
}
static void init_amd_zn(struct cpuinfo_x86 *c)
@@ -925,7 +860,6 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
- case 0x16: init_amd_jg(c); break;
case 0x17: init_amd_zn(c); break;
}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index c9ef6a7..24b079e 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -12,7 +12,6 @@
#include <linux/smp.h>
#include <linux/perf_event.h>
#include <linux/tboot.h>
-#include <linux/dmi.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
@@ -24,7 +23,7 @@
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
-#include <asm/cpu_device_id.h>
+#include <linux/dmi.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -398,14 +397,15 @@ static int __init bsp_pm_check_init(void)
core_initcall(bsp_pm_check_init);
-static int msr_build_context(const u32 *msr_id, const int num)
+static int msr_init_context(const u32 *msr_id, const int total_num)
{
- struct saved_msrs *saved_msrs = &saved_context.saved_msrs;
+ int i = 0;
struct saved_msr *msr_array;
- int total_num;
- int i, j;
- total_num = saved_msrs->num + num;
+ if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) {
+ pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n");
+ return -EINVAL;
+ }
msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL);
if (!msr_array) {
@@ -413,30 +413,19 @@ static int msr_build_context(const u32 *msr_id, const int num)
return -ENOMEM;
}
- if (saved_msrs->array) {
- /*
- * Multiple callbacks can invoke this function, so copy any
- * MSR save requests from previous invocations.
- */
- memcpy(msr_array, saved_msrs->array,
- sizeof(struct saved_msr) * saved_msrs->num);
-
- kfree(saved_msrs->array);
- }
-
- for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
- msr_array[i].info.msr_no = msr_id[j];
+ for (i = 0; i < total_num; i++) {
+ msr_array[i].info.msr_no = msr_id[i];
msr_array[i].valid = false;
msr_array[i].info.reg.q = 0;
}
- saved_msrs->num = total_num;
- saved_msrs->array = msr_array;
+ saved_context.saved_msrs.num = total_num;
+ saved_context.saved_msrs.array = msr_array;
return 0;
}
/*
- * The following sections are a quirk framework for problematic BIOSen:
+ * The following section is a quirk framework for problematic BIOSen:
* Sometimes MSRs are modified by the BIOSen after suspended to
* RAM, this might cause unexpected behavior after wakeup.
* Thus we save/restore these specified MSRs across suspend/resume
@@ -451,7 +440,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d)
u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL };
pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident);
- return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+ return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
}
static const struct dmi_system_id msr_save_dmi_table[] = {
@@ -466,58 +455,9 @@ static const struct dmi_system_id msr_save_dmi_table[] = {
{}
};
-static int msr_save_cpuid_features(const struct x86_cpu_id *c)
-{
- u32 cpuid_msr_id[] = {
- MSR_AMD64_CPUID_FN_1,
- };
-
- pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n",
- c->family);
-
- return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id));
-}
-
-static const struct x86_cpu_id msr_save_cpu_table[] = {
- {
- .vendor = X86_VENDOR_AMD,
- .family = 0x15,
- .model = X86_MODEL_ANY,
- .feature = X86_FEATURE_ANY,
- .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
- },
- {
- .vendor = X86_VENDOR_AMD,
- .family = 0x16,
- .model = X86_MODEL_ANY,
- .feature = X86_FEATURE_ANY,
- .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
- },
- {}
-};
-
-typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *);
-static int pm_cpu_check(const struct x86_cpu_id *c)
-{
- const struct x86_cpu_id *m;
- int ret = 0;
-
- m = x86_match_cpu(msr_save_cpu_table);
- if (m) {
- pm_cpu_match_t fn;
-
- fn = (pm_cpu_match_t)m->driver_data;
- ret = fn(m);
- }
-
- return ret;
-}
-
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
- pm_cpu_check(msr_save_cpu_table);
-
return 0;
}
In perf_rotate_context(), when the first cpu flexible event fails to
schedule, cpu_rotate is 1, while cpu_event is NULL. Since cpu_event is
NULL, perf_rotate_context will _NOT_ call cpu_ctx_sched_out(), thus
cpuctx->ctx.is_active will have EVENT_FLEXIBLE set. Then, the next
perf_event_sched_in() will skip all cpu flexible events because of the
EVENT_FLEXIBLE bit.
In the next call of perf_rotate_context(), cpu_rotate stays 1, and
cpu_event stays NULL, so this process repeats. The end result is that flexible
events on this cpu will not be scheduled (until another event is added
to the cpuctx).
Here is an easy repro of this issue. On Intel CPUs, where ref-cycles
could only use one counter, run one pinned event for ref-cycles, one
flexible event for ref-cycles, and one flexible event for cycles. The
flexible ref-cycles is never scheduled, which is expected. However,
because of this issue, the cycles event is never scheduled either.
  perf stat -e ref-cycles:D,ref-cycles,cycles -C 5 -I 1000
           time             counts unit events
    1.000152973         15,412,480      ref-cycles:D
    1.000152973      <not counted>      ref-cycles        (0.00%)
    1.000152973      <not counted>      cycles            (0.00%)
    2.000486957         18,263,120      ref-cycles:D
    2.000486957      <not counted>      ref-cycles        (0.00%)
    2.000486957      <not counted>      cycles            (0.00%)
To fix this, when the flexible_active list is empty, try to rotate the
first event in the flexible_groups. Also, rename ctx_first_active() to
ctx_event_to_rotate(), which is more accurate.
Fixes: 8d5bce0c37fa ("perf/core: Optimize perf_rotate_context() event scheduling")
Cc: stable(a)vger.kernel.org # v4.17+
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Arnaldo Carvalho de Melo <acme(a)kernel.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Song Liu <songliubraving(a)fb.com>
---
kernel/events/core.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f0cb82e4fbc..b96cefed4fb2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3779,11 +3779,21 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event)
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3808,9 +3818,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(task_ctx);
+ task_event = ctx_event_to_rotate(task_ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
--
2.17.1
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: FAILED
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/214101
We attempted to compile the kernel for multiple architectures, but the compile
failed on one or more architectures:
aarch64: FAILED (see build-aarch64.log.xz attachment)
ppc64le: FAILED (see build-ppc64le.log.xz attachment)
x86_64: FAILED (see build-x86_64.log.xz attachment)
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
We grabbed the 59cbf36cb5d9 commit of the stable queue repository.
We then merged the patchset with `git am`:
s390-process-avoid-potential-reading-of-freed-stack.patch
s390-sclp-fix-bit-checked-for-has_sipl.patch
kvm-s390-test-for-bad-access-register-and-size-at-the-start-of-s390_mem_op.patch
s390-topology-avoid-firing-events-before-kobjs-are-created.patch
s390-cio-avoid-calling-strlen-on-null-pointer.patch
s390-cio-exclude-subchannels-with-no-parent-from-pseudo-check.patch
s390-dasd-fix-error-handling-during-online-processing.patch
revert-s390-dasd-add-discard-support-for-ese-volumes.patch
kvm-s390-fix-__insn32_query-inline-assembly.patch
kvm-ppc-book3s-enable-xive-native-capability-only-if-opal-has-required-functions.patch
kvm-ppc-book3s-hv-xive-free-escalation-interrupts-before-disabling-the-vp.patch
kvm-ppc-book3s-hv-don-t-push-xive-context-when-not-using-xive-device.patch
kvm-ppc-book3s-hv-fix-race-in-re-enabling-xive-escalation-interrupts.patch
kvm-ppc-book3s-hv-check-for-mmu-ready-on-piggybacked-virtual-cores.patch
kvm-ppc-book3s-hv-don-t-lose-pending-doorbell-request-on-migration-on-p9.patch
kvm-x86-fix-userspace-set-invalid-cr4.patch
nbd-fix-max-number-of-supported-devs.patch
pm-devfreq-tegra-fix-khz-to-hz-conversion.patch
asoc-define-a-set-of-dapm-pre-post-up-events.patch
asoc-sgtl5000-improve-vag-power-and-mute-control.patch
powerpc-xive-implement-get_irqchip_state-method-for-xive-to-fix-shutdown-race.patch
powerpc-mce-fix-mce-handling-for-huge-pages.patch
powerpc-mce-schedule-work-from-irq_work.patch
powerpc-603-fix-handling-of-the-dirty-flag.patch
powerpc-32s-fix-boot-failure-with-debug_pagealloc-without-kasan.patch
powerpc-ptdump-fix-addresses-display-on-ppc32.patch
powerpc-powernv-restrict-opal-symbol-map-to-only-be-readable-by-root.patch
powerpc-pseries-fix-cpu_hotplug_lock-acquisition-in-resize_hpt.patch
powerpc-powernv-ioda-fix-race-in-tce-level-allocation.patch
powerpc-kasan-fix-parallel-loading-of-modules.patch
powerpc-kasan-fix-shadow-area-set-up-for-modules.patch
powerpc-book3s64-mm-don-t-do-tlbie-fixup-for-some-hardware-revisions.patch
powerpc-book3s64-radix-rename-cpu_ftr_p9_tlbie_bug-feature-flag.patch
powerpc-mm-add-a-helper-to-select-page_kernel_ro-or-page_readonly.patch
powerpc-mm-fix-an-oops-in-kasan_mmu_init.patch
powerpc-mm-fixup-tlbie-vs-mtpidr-mtlpidr-ordering-issue-on-power9.patch
can-mcp251x-mcp251x_hw_reset-allow-more-time-after-a-reset.patch
tools-lib-traceevent-fix-robust-test-of-do_generate_dynamic_list_file.patch
tools-lib-traceevent-do-not-free-tep-cmdlines-in-add_new_comm-on-failure.patch
crypto-qat-silence-smp_processor_id-warning.patch
crypto-skcipher-unmap-pages-after-an-external-error.patch
crypto-cavium-zip-add-missing-single_release.patch
crypto-caam-qi-fix-error-handling-in-ern-handler.patch
crypto-caam-fix-concurrency-issue-in-givencrypt-descriptor.patch
crypto-ccree-account-for-tee-not-ready-to-report.patch
crypto-ccree-use-the-full-crypt-length-value.patch
mips-treat-loongson-extensions-as-ases.patch
power-supply-sbs-battery-use-correct-flags-field.patch
power-supply-sbs-battery-only-return-health-when-battery-present.patch
tracing-make-sure-variable-reference-alias-has-correct-var_ref_idx.patch
usercopy-avoid-highmem-pfn-warning.patch
timer-read-jiffies-once-when-forwarding-base-clk.patch
pci-vmd-fix-config-addressing-when-using-bus-offsets.patch
pci-hv-avoid-use-of-hv_pci_dev-pci_slot-after-freeing-it.patch
pci-vmd-fix-shadow-offsets-to-reflect-spec-changes.patch
pci-restore-resizable-bar-size-bits-correctly-for-1mb-bars.patch
selftests-tpm2-add-the-missing-test_files-assignment.patch
selftests-pidfd-fix-undefined-reference-to-pthread_create.patch
watchdog-imx2_wdt-fix-min-calculation-in-imx2_wdt_set_timeout.patch
perf-tools-fix-segfault-in-cpu_cache_level__read.patch
perf-stat-fix-a-segmentation-fault-when-using-repeat-forever.patch
drm-i915-dp-fix-dsc-bpp-calculations-v5.patch
drm-atomic-reject-flip_async-unconditionally.patch
drm-atomic-take-the-atomic-toys-away-from-x.patch
drm-mali-dp-mark-expected-switch-fall-through.patch
drm-omap-fix-max-fclk-divider-for-omap36xx.patch
drm-msm-dsi-fix-return-value-check-for-clk_get_parent.patch
drm-nouveau-kms-nv50-don-t-create-mstms-for-edp-connectors.patch
drm-amd-powerplay-change-metrics-update-period-from-1ms-to-100ms.patch
drm-i915-gvt-update-vgpu-workload-head-pointer-correctly.patch
drm-i915-userptr-acquire-the-page-lock-around-set_page_dirty.patch
drm-i915-to-make-vgpu-ppgtt-notificaiton-as-atomic-operation.patch
mac80211-keep-bhs-disabled-while-calling-drv_tx_wake_queue.patch
mmc-tegra-implement-set_dma_mask.patch
mmc-sdhci-improve-adma-error-reporting.patch
mmc-sdhci-of-esdhc-set-dma-snooping-based-on-dma-coherence.patch
mmc-sdhci-let-drivers-define-their-dma-mask.patch
revert-locking-pvqspinlock-don-t-wait-if-vcpu-is-preempted.patch
libnvdimm-altmap-track-namespace-boundaries-in-altmap.patch
libnvdimm-prevent-nvdimm-from-requesting-key-when-security-is-disabled.patch
sched-add-__assembly__-guards-around-struct-clone_args.patch
dts-arm-gta04-introduce-legacy-spi-cs-high-to-make-display-work-again.patch
xen-balloon-set-pages-pageoffline-in-balloon_add_region.patch
xen-xenbus-fix-self-deadlock-after-killing-user-process.patch
ieee802154-atusb-fix-use-after-free-at-disconnect.patch
nl80211-validate-beacon-head.patch
cfg80211-validate-ssid-mbssid-element-ordering-assumption.patch
cfg80211-initialize-on-stack-chandefs.patch
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: FAILED
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/214089
We attempted to compile the kernel for multiple architectures, but the compile
failed on one or more architectures:
aarch64: FAILED (see build-aarch64.log.xz attachment)
ppc64le: FAILED (see build-ppc64le.log.xz attachment)
x86_64: FAILED (see build-x86_64.log.xz attachment)
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
We grabbed the 135f0b7c29f2 commit of the stable queue repository.
We then merged the patchset with `git am`:
s390-process-avoid-potential-reading-of-freed-stack.patch
s390-sclp-fix-bit-checked-for-has_sipl.patch
kvm-s390-test-for-bad-access-register-and-size-at-the-start-of-s390_mem_op.patch
s390-topology-avoid-firing-events-before-kobjs-are-created.patch
s390-cio-avoid-calling-strlen-on-null-pointer.patch
s390-cio-exclude-subchannels-with-no-parent-from-pseudo-check.patch
s390-dasd-fix-error-handling-during-online-processing.patch
revert-s390-dasd-add-discard-support-for-ese-volumes.patch
kvm-s390-fix-__insn32_query-inline-assembly.patch
kvm-ppc-book3s-enable-xive-native-capability-only-if-opal-has-required-functions.patch
kvm-ppc-book3s-hv-xive-free-escalation-interrupts-before-disabling-the-vp.patch
kvm-ppc-book3s-hv-don-t-push-xive-context-when-not-using-xive-device.patch
kvm-ppc-book3s-hv-fix-race-in-re-enabling-xive-escalation-interrupts.patch
kvm-ppc-book3s-hv-check-for-mmu-ready-on-piggybacked-virtual-cores.patch
kvm-ppc-book3s-hv-don-t-lose-pending-doorbell-request-on-migration-on-p9.patch
kvm-x86-fix-userspace-set-invalid-cr4.patch
nbd-fix-max-number-of-supported-devs.patch
pm-devfreq-tegra-fix-khz-to-hz-conversion.patch
asoc-define-a-set-of-dapm-pre-post-up-events.patch
asoc-sgtl5000-improve-vag-power-and-mute-control.patch
powerpc-xive-implement-get_irqchip_state-method-for-xive-to-fix-shutdown-race.patch
powerpc-mce-fix-mce-handling-for-huge-pages.patch
powerpc-mce-schedule-work-from-irq_work.patch
powerpc-603-fix-handling-of-the-dirty-flag.patch
powerpc-32s-fix-boot-failure-with-debug_pagealloc-without-kasan.patch
powerpc-ptdump-fix-addresses-display-on-ppc32.patch
powerpc-powernv-restrict-opal-symbol-map-to-only-be-readable-by-root.patch
powerpc-pseries-fix-cpu_hotplug_lock-acquisition-in-resize_hpt.patch
powerpc-powernv-ioda-fix-race-in-tce-level-allocation.patch
powerpc-kasan-fix-parallel-loading-of-modules.patch
powerpc-kasan-fix-shadow-area-set-up-for-modules.patch
powerpc-book3s64-mm-don-t-do-tlbie-fixup-for-some-hardware-revisions.patch
powerpc-book3s64-radix-rename-cpu_ftr_p9_tlbie_bug-feature-flag.patch
powerpc-mm-add-a-helper-to-select-page_kernel_ro-or-page_readonly.patch
powerpc-mm-fix-an-oops-in-kasan_mmu_init.patch
powerpc-mm-fixup-tlbie-vs-mtpidr-mtlpidr-ordering-issue-on-power9.patch
can-mcp251x-mcp251x_hw_reset-allow-more-time-after-a-reset.patch
tools-lib-traceevent-fix-robust-test-of-do_generate_dynamic_list_file.patch
tools-lib-traceevent-do-not-free-tep-cmdlines-in-add_new_comm-on-failure.patch
crypto-qat-silence-smp_processor_id-warning.patch
crypto-skcipher-unmap-pages-after-an-external-error.patch
crypto-cavium-zip-add-missing-single_release.patch
crypto-caam-qi-fix-error-handling-in-ern-handler.patch
crypto-caam-fix-concurrency-issue-in-givencrypt-descriptor.patch
crypto-ccree-account-for-tee-not-ready-to-report.patch
crypto-ccree-use-the-full-crypt-length-value.patch
mips-treat-loongson-extensions-as-ases.patch
power-supply-sbs-battery-use-correct-flags-field.patch
power-supply-sbs-battery-only-return-health-when-battery-present.patch
tracing-make-sure-variable-reference-alias-has-correct-var_ref_idx.patch
usercopy-avoid-highmem-pfn-warning.patch
timer-read-jiffies-once-when-forwarding-base-clk.patch
pci-vmd-fix-config-addressing-when-using-bus-offsets.patch
pci-hv-avoid-use-of-hv_pci_dev-pci_slot-after-freeing-it.patch
pci-vmd-fix-shadow-offsets-to-reflect-spec-changes.patch
pci-restore-resizable-bar-size-bits-correctly-for-1mb-bars.patch
selftests-tpm2-add-the-missing-test_files-assignment.patch
selftests-pidfd-fix-undefined-reference-to-pthread_create.patch
watchdog-imx2_wdt-fix-min-calculation-in-imx2_wdt_set_timeout.patch
perf-tools-fix-segfault-in-cpu_cache_level__read.patch
perf-stat-fix-a-segmentation-fault-when-using-repeat-forever.patch
drm-i915-dp-fix-dsc-bpp-calculations-v5.patch
drm-atomic-reject-flip_async-unconditionally.patch
drm-atomic-take-the-atomic-toys-away-from-x.patch
drm-mali-dp-mark-expected-switch-fall-through.patch
drm-omap-fix-max-fclk-divider-for-omap36xx.patch
drm-msm-dsi-fix-return-value-check-for-clk_get_parent.patch
drm-nouveau-kms-nv50-don-t-create-mstms-for-edp-connectors.patch
drm-amd-powerplay-change-metrics-update-period-from-1ms-to-100ms.patch
drm-i915-gvt-update-vgpu-workload-head-pointer-correctly.patch
drm-i915-userptr-acquire-the-page-lock-around-set_page_dirty.patch
drm-i915-flush-extra-hard-after-writing-relocations-through-the-gtt.patch
drm-i915-to-make-vgpu-ppgtt-notificaiton-as-atomic-operation.patch
mac80211-keep-bhs-disabled-while-calling-drv_tx_wake_queue.patch
mmc-tegra-implement-set_dma_mask.patch
mmc-sdhci-improve-adma-error-reporting.patch
mmc-sdhci-of-esdhc-set-dma-snooping-based-on-dma-coherence.patch
mmc-sdhci-let-drivers-define-their-dma-mask.patch
revert-locking-pvqspinlock-don-t-wait-if-vcpu-is-preempted.patch
libnvdimm-altmap-track-namespace-boundaries-in-altmap.patch
libnvdimm-prevent-nvdimm-from-requesting-key-when-security-is-disabled.patch
sched-add-__assembly__-guards-around-struct-clone_args.patch
dts-arm-gta04-introduce-legacy-spi-cs-high-to-make-display-work-again.patch
xen-balloon-set-pages-pageoffline-in-balloon_add_region.patch
xen-xenbus-fix-self-deadlock-after-killing-user-process.patch
ieee802154-atusb-fix-use-after-free-at-disconnect.patch
nl80211-validate-beacon-head.patch
cfg80211-validate-ssid-mbssid-element-ordering-assumption.patch
cfg80211-initialize-on-stack-chandefs.patch
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: FAILED
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/214001
We attempted to compile the kernel for multiple architectures, but the compile
failed on one or more architectures:
aarch64: FAILED (see build-aarch64.log.xz attachment)
ppc64le: FAILED (see build-ppc64le.log.xz attachment)
x86_64: FAILED (see build-x86_64.log.xz attachment)
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
We grabbed the 65ceb57676c9 commit of the stable queue repository.
We then merged the patchset with `git am`:
s390-process-avoid-potential-reading-of-freed-stack.patch
s390-sclp-fix-bit-checked-for-has_sipl.patch
kvm-s390-test-for-bad-access-register-and-size-at-the-start-of-s390_mem_op.patch
s390-topology-avoid-firing-events-before-kobjs-are-created.patch
s390-cio-avoid-calling-strlen-on-null-pointer.patch
s390-cio-exclude-subchannels-with-no-parent-from-pseudo-check.patch
s390-dasd-fix-error-handling-during-online-processing.patch
revert-s390-dasd-add-discard-support-for-ese-volumes.patch
kvm-s390-fix-__insn32_query-inline-assembly.patch
kvm-ppc-book3s-enable-xive-native-capability-only-if-opal-has-required-functions.patch
kvm-ppc-book3s-hv-xive-free-escalation-interrupts-before-disabling-the-vp.patch
kvm-ppc-book3s-hv-don-t-push-xive-context-when-not-using-xive-device.patch
kvm-ppc-book3s-hv-fix-race-in-re-enabling-xive-escalation-interrupts.patch
kvm-ppc-book3s-hv-check-for-mmu-ready-on-piggybacked-virtual-cores.patch
kvm-ppc-book3s-hv-don-t-lose-pending-doorbell-request-on-migration-on-p9.patch
kvm-x86-fix-userspace-set-invalid-cr4.patch
nbd-fix-max-number-of-supported-devs.patch
pm-devfreq-tegra-fix-khz-to-hz-conversion.patch
asoc-define-a-set-of-dapm-pre-post-up-events.patch
asoc-sgtl5000-improve-vag-power-and-mute-control.patch
powerpc-xive-implement-get_irqchip_state-method-for-xive-to-fix-shutdown-race.patch
powerpc-mce-fix-mce-handling-for-huge-pages.patch
powerpc-mce-schedule-work-from-irq_work.patch
powerpc-603-fix-handling-of-the-dirty-flag.patch
powerpc-32s-fix-boot-failure-with-debug_pagealloc-without-kasan.patch
powerpc-ptdump-fix-addresses-display-on-ppc32.patch
powerpc-powernv-restrict-opal-symbol-map-to-only-be-readable-by-root.patch
powerpc-pseries-fix-cpu_hotplug_lock-acquisition-in-resize_hpt.patch
powerpc-powernv-ioda-fix-race-in-tce-level-allocation.patch
powerpc-kasan-fix-parallel-loading-of-modules.patch
powerpc-kasan-fix-shadow-area-set-up-for-modules.patch
powerpc-book3s64-mm-don-t-do-tlbie-fixup-for-some-hardware-revisions.patch
powerpc-book3s64-radix-rename-cpu_ftr_p9_tlbie_bug-feature-flag.patch
powerpc-mm-add-a-helper-to-select-page_kernel_ro-or-page_readonly.patch
powerpc-mm-fix-an-oops-in-kasan_mmu_init.patch
powerpc-mm-fixup-tlbie-vs-mtpidr-mtlpidr-ordering-issue-on-power9.patch
can-mcp251x-mcp251x_hw_reset-allow-more-time-after-a-reset.patch
tools-lib-traceevent-fix-robust-test-of-do_generate_dynamic_list_file.patch
tools-lib-traceevent-do-not-free-tep-cmdlines-in-add_new_comm-on-failure.patch
crypto-qat-silence-smp_processor_id-warning.patch
crypto-skcipher-unmap-pages-after-an-external-error.patch
crypto-cavium-zip-add-missing-single_release.patch
crypto-caam-qi-fix-error-handling-in-ern-handler.patch
crypto-caam-fix-concurrency-issue-in-givencrypt-descriptor.patch
crypto-ccree-account-for-tee-not-ready-to-report.patch
crypto-ccree-use-the-full-crypt-length-value.patch
mips-treat-loongson-extensions-as-ases.patch
power-supply-sbs-battery-use-correct-flags-field.patch
power-supply-sbs-battery-only-return-health-when-battery-present.patch
tracing-make-sure-variable-reference-alias-has-correct-var_ref_idx.patch
usercopy-avoid-highmem-pfn-warning.patch
timer-read-jiffies-once-when-forwarding-base-clk.patch
pci-vmd-fix-config-addressing-when-using-bus-offsets.patch
pci-hv-avoid-use-of-hv_pci_dev-pci_slot-after-freeing-it.patch
pci-vmd-fix-shadow-offsets-to-reflect-spec-changes.patch
pci-restore-resizable-bar-size-bits-correctly-for-1mb-bars.patch
selftests-tpm2-add-the-missing-test_files-assignment.patch
selftests-pidfd-fix-undefined-reference-to-pthread_create.patch
watchdog-imx2_wdt-fix-min-calculation-in-imx2_wdt_set_timeout.patch
perf-tools-fix-segfault-in-cpu_cache_level__read.patch
perf-stat-fix-a-segmentation-fault-when-using-repeat-forever.patch
drm-i915-dp-fix-dsc-bpp-calculations-v5.patch
drm-atomic-reject-flip_async-unconditionally.patch
drm-atomic-take-the-atomic-toys-away-from-x.patch
drm-mali-dp-mark-expected-switch-fall-through.patch
drm-omap-fix-max-fclk-divider-for-omap36xx.patch
drm-msm-dsi-fix-return-value-check-for-clk_get_parent.patch
drm-nouveau-kms-nv50-don-t-create-mstms-for-edp-connectors.patch
drm-amd-powerplay-change-metrics-update-period-from-1ms-to-100ms.patch
drm-i915-gvt-update-vgpu-workload-head-pointer-correctly.patch
drm-i915-userptr-acquire-the-page-lock-around-set_page_dirty.patch
drm-i915-use-maximum-write-flush-for-pwrite_gtt.patch
drm-i915-flush-extra-hard-after-writing-relocations-through-the-gtt.patch
drm-i915-to-make-vgpu-ppgtt-notificaiton-as-atomic-operation.patch
mac80211-keep-bhs-disabled-while-calling-drv_tx_wake_queue.patch
mmc-tegra-implement-set_dma_mask.patch
mmc-sdhci-improve-adma-error-reporting.patch
mmc-sdhci-of-esdhc-set-dma-snooping-based-on-dma-coherence.patch
mmc-sdhci-let-drivers-define-their-dma-mask.patch
revert-locking-pvqspinlock-don-t-wait-if-vcpu-is-preempted.patch
libnvdimm-altmap-track-namespace-boundaries-in-altmap.patch
libnvdimm-prevent-nvdimm-from-requesting-key-when-security-is-disabled.patch
sched-add-__assembly__-guards-around-struct-clone_args.patch
dts-arm-gta04-introduce-legacy-spi-cs-high-to-make-display-work-again.patch
xen-balloon-set-pages-pageoffline-in-balloon_add_region.patch
xen-xenbus-fix-self-deadlock-after-killing-user-process.patch
ieee802154-atusb-fix-use-after-free-at-disconnect.patch
nl80211-validate-beacon-head.patch
cfg80211-validate-ssid-mbssid-element-ordering-assumption.patch
cfg80211-initialize-on-stack-chandefs.patch
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
The patch titled
Subject: mm, compaction: fix wrong pfn handling in __reset_isolation_pfn()
has been added to the -mm tree. Its filename is
mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-compaction-fix-wrong-pfn-handli…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-compaction-fix-wrong-pfn-handli…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, compaction: fix wrong pfn handling in __reset_isolation_pfn()
Florian and Dave reported [1] a NULL pointer dereference in
__reset_isolation_pfn(). While the exact cause is unclear, staring at the
code revealed two bugs, which might be related.
One bug is that if the zone starts in the middle of a pageblock, block_page
might correspond to a different pfn than block_pfn, and then the
pfn_valid_within() checks will check different pfns than those accessed
via struct page. This might result in accessing an uninitialized page in
CONFIG_HOLES_IN_ZONE configs.
The other bug is that end_page refers to the first page of the next pageblock
and not the last page of the current pageblock. The online and valid check is
then wrong, and with sections the while (page < end_page) loop might
wander off the actual struct page arrays.
[1] https://lore.kernel.org/linux-xfs/87o8z1fvqu.fsf@mid.deneb.enyo.de/
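In short, the corrected handling keeps the pfn and the struct page in
sync and makes end_page point at the last page of the block (condensed
from the hunk below):

  /* start of pageblock (or zone): keep block_pfn and block_page in sync */
  block_pfn = max(pageblock_start_pfn(pfn), zone->zone_start_pfn);
  block_page = pfn_to_online_page(block_pfn);

  /* end of pageblock (or zone): the last page, not the first of the next */
  block_pfn = min(pageblock_end_pfn(pfn) - 1, zone_end_pfn(zone) - 1);
  end_page = pfn_to_online_page(block_pfn);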
Link: http://lkml.kernel.org/r/20191008152915.24704-1-vbabka@suse.cz
Fixes: 6b0868c820ff ("mm/compaction.c: correct zone boundary handling when resetting pageblock skip hints")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Florian Weimer <fw(a)deneb.enyo.de>
Reported-by: Dave Chinner <david(a)fromorbit.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/mm/compaction.c~mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn
+++ a/mm/compaction.c
@@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone,
/* Ensure the start of the pageblock or zone is online and valid */
block_pfn = pageblock_start_pfn(pfn);
- block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn));
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
if (block_page) {
page = block_page;
pfn = block_pfn;
}
/* Ensure the end of the pageblock or zone is online and valid */
- block_pfn += pageblock_nr_pages;
+ block_pfn = pageblock_end_pfn(pfn) - 1;
block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
end_page = pfn_to_online_page(block_pfn);
if (!end_page)
@@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone,
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
- } while (page < end_page);
+ } while (page <= end_page);
return false;
}
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-page_owner-fix-off-by-one-error-in-__set_page_owner_handle.patch
mm-page_owner-decouple-freeing-stack-trace-from-debug_pagealloc.patch
mm-page_owner-decouple-freeing-stack-trace-from-debug_pagealloc-v3.patch
mm-page_owner-rename-flag-indicating-that-page-is-allocated.patch
mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch
The patch below does not apply to the 5.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 576f05865581f82ac988ffec70e4e2ebd31165db Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Tue, 30 Jul 2019 12:21:51 +0100
Subject: [PATCH] drm/i915: Flush extra hard after writing relocations through
the GTT
Recently discovered in commit bdae33b8b82b ("drm/i915: Use maximum write
flush for pwrite_gtt") was that we needed to do our full write barrier
before changing the GGTT PTE to ensure that our indirect writes through
the GTT landed before the PTE changed (and the writes end up in a
different page). That also applies to our GGTT relocation path.
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: stable(a)vger.kernel.org
Reviewed-by: Prathap Kumar Valsan <prathap.kumar.valsan(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190730112151.5633-4-chris@c…
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index cbd7c6e3a1f8..4db4463089ce 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1014,11 +1014,12 @@ static void reloc_cache_reset(struct reloc_cache *cache)
kunmap_atomic(vaddr);
i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm);
} else {
- wmb();
+ struct i915_ggtt *ggtt = cache_to_ggtt(cache);
+
+ intel_gt_flush_ggtt_writes(ggtt->vm.gt);
io_mapping_unmap_atomic((void __iomem *)vaddr);
- if (cache->node.allocated) {
- struct i915_ggtt *ggtt = cache_to_ggtt(cache);
+ if (cache->node.allocated) {
ggtt->vm.clear_range(&ggtt->vm,
cache->node.start,
cache->node.size);
@@ -1073,6 +1074,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
void *vaddr;
if (cache->vaddr) {
+ intel_gt_flush_ggtt_writes(ggtt->vm.gt);
io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
} else {
struct i915_vma *vma;
@@ -1114,7 +1116,6 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
offset = cache->node.start;
if (cache->node.allocated) {
- wmb();
ggtt->vm.insert_page(&ggtt->vm,
i915_gem_object_get_dma_address(obj, page),
offset, I915_CACHE_NONE, 0);
The patch below does not apply to the 5.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bdae33b8b82bb379a5b11040b0b37df25c7871c9 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Thu, 18 Jul 2019 15:54:05 +0100
Subject: [PATCH] drm/i915: Use maximum write flush for pwrite_gtt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As recently discovered by forcing big-core (!llc) machines to use the GTT
paths, we need our full GTT write flush before manipulating the GTT PTE
or else the writes may be directed to the wrong page.
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen(a)linux.intel.com>
Cc: Matthew Auld <matthew.william.auld(a)gmail.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190718145407.21352-2-chris@…
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fed0bc421a55..c6ba350e6e4f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -610,7 +610,8 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
unsigned int page_length = PAGE_SIZE - page_offset;
page_length = remain < page_length ? remain : page_length;
if (node.allocated) {
- wmb(); /* flush the write before we modify the GGTT */
+ /* flush the write before we modify the GGTT */
+ intel_gt_flush_ggtt_writes(ggtt->vm.gt);
ggtt->vm.insert_page(&ggtt->vm,
i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
node.start, I915_CACHE_NONE, 0);
@@ -639,8 +640,8 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
i915_gem_object_unlock_fence(obj, fence);
out_unpin:
mutex_lock(&i915->drm.struct_mutex);
+ intel_gt_flush_ggtt_writes(ggtt->vm.gt);
if (node.allocated) {
- wmb();
ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
remove_mappable_node(&node);
} else {
The patch titled
Subject: mm/page_alloc.c: fix a crash in free_pages_prepare()
has been removed from the -mm tree. Its filename was
mm-page_alloc-fix-a-crash-in-free_pages_prepare.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Qian Cai <cai(a)lca.pw>
Subject: mm/page_alloc.c: fix a crash in free_pages_prepare()
On architectures like s390, arch_free_page() could mark the page unused
(set_page_unused()) and any access later would trigger a kernel panic.
Fix it by moving arch_free_page() after all possible accessing calls.
Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)
Krnl PSW : 0404e00180000000 0000000026c2b96e (__free_pages_ok+0x34e/0x5d8)
           R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
Krnl GPRS: 0000000088d43af7 0000000000484000 000000000000007c 000000000000000f
           000003d080012100 000003d080013fc0 0000000000000000 0000000000100000
           00000000275cca48 0000000000000100 0000000000000008 000003d080010000
           00000000000001d0 000003d000000000 0000000026c2b78a 000000002717fdb0
Krnl Code: 0000000026c2b95c: ec1100b30659 risbgn %r1,%r1,0,179,6
0000000026c2b962: e32014000036 pfd 2,1024(%r1)
#0000000026c2b968: d7ff10001000 xc 0(256,%r1),0(%r1)
>0000000026c2b96e: 41101100 la %r1,256(%r1)
0000000026c2b972: a737fff8 brctg %r3,26c2b962
0000000026c2b976: d7ff10001000 xc 0(256,%r1),0(%r1)
0000000026c2b97c: e31003400004 lg %r1,832
0000000026c2b982: ebff1430016a asi 5168(%r1),-1
Call Trace:
__free_pages_ok+0x16a/0x5d8
memblock_free_all+0x206/0x290
mem_init+0x58/0x120
start_kernel+0x2b0/0x570
startup_continue+0x6a/0xc0
INFO: lockdep is turned off.
Last Breaking-Event-Address:
__free_pages_ok+0x372/0x5d8
Kernel panic - not syncing: Fatal exception: panic_on_oops
00: HCPGIR450W CP entered; disabled wait PSW 00020001 80000000 00000000 26A2379C
In the past, only kernel_poison_pages() would trigger this, but it needs the
"page_poison=on" kernel cmdline, and I suspect nobody tested that on
s390. Recently, kernel_init_free_pages() (commit 6471384af2a6 ("mm:
security: introduce init_on_alloc=1 and init_on_free=1 boot options"))
was added and could trigger this as well.
[akpm(a)linux-foundation.org: add comment]
Link: http://lkml.kernel.org/r/1569613623-16820-1-git-send-email-cai@lca.pw
Fixes: 8823b1dbc05f ("mm/page_poison.c: enable PAGE_POISONING as a separate option")
Fixes: 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options")
Signed-off-by: Qian Cai <cai(a)lca.pw>
Reviewed-by: Heiko Carstens <heiko.carstens(a)de.ibm.com>
Acked-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: "Kirill A. Shutemov" <kirill(a)shutemov.name>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: Alexander Duyck <alexander.duyck(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [5.3+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
--- a/mm/page_alloc.c~mm-page_alloc-fix-a-crash-in-free_pages_prepare
+++ a/mm/page_alloc.c
@@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_p
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- arch_free_page(page, order);
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order, 0);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
_
Patches currently in -mm which might be from cai(a)lca.pw are
mm-slub-fix-a-deadlock-in-show_slab_objects.patch
The patch titled
Subject: mm/z3fold.c: claim page in the beginning of free
has been removed from the -mm tree. Its filename was
z3fold-claim-page-in-the-beginning-of-free.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Vitaly Wool <vitalywool(a)gmail.com>
Subject: mm/z3fold.c: claim page in the beginning of free
There's a really hard-to-reproduce race in z3fold between z3fold_free()
and z3fold_reclaim_page(): z3fold_reclaim_page() can claim the page after
z3fold_free() has checked whether the page was claimed, and z3fold_free()
will then schedule this page for compaction, which may in turn lead to
random page faults (since that page will have been reclaimed by then).
Fix that by claiming the page at the beginning of z3fold_free() and not
forgetting to clear the claim at the end.
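For illustration only, here is a minimal userspace analogue (hypothetical code, not z3fold itself) of the claim-first ordering: both the reclaim path and the free path try to claim the page with an atomic test-and-set before doing anything else, so exactly one of them ends up owning the page and the other backs off.

/* build with: cc -pthread claim.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag claimed = ATOMIC_FLAG_INIT;	/* models PAGE_CLAIMED */

/* models z3fold_reclaim_page(): try to claim the page for reclaim */
static void *reclaim(void *arg)
{
	(void)arg;
	if (!atomic_flag_test_and_set(&claimed))
		puts("reclaim: claimed the page, reclaiming it");
	else
		puts("reclaim: page already claimed by free, leaving it alone");
	return NULL;
}

/* models the fixed z3fold_free(): claim *before* deciding what to do */
static void *freer(void *arg)
{
	(void)arg;
	int was_claimed = atomic_flag_test_and_set(&claimed);

	if (was_claimed)
		puts("free: page is under reclaim, do not schedule compaction");
	else
		puts("free: we own the page, safe to schedule compaction");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, reclaim, NULL);
	pthread_create(&b, NULL, freer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Because the claim is taken before either side acts, the window in which reclaim could grab the page between z3fold_free()'s check and its compaction scheduling no longer exists.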
[vitalywool(a)gmail.com: v2]
Link: http://lkml.kernel.org/r/20190928113456.152742cf@bigdell
Link: http://lkml.kernel.org/r/20190926104844.4f0c6efa1366b8f5741eaba9@gmail.com
Signed-off-by: Vitaly Wool <vitalywool(a)gmail.com>
Reported-by: Markus Linnala <markus.linnala(a)gmail.com>
Cc: Dan Streetman <ddstreet(a)ieee.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Markus Linnala <markus.linnala(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/mm/z3fold.c~z3fold-claim-page-in-the-beginning-of-free
+++ a/mm/z3fold.c
@@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_po
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_po
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_po
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page has not been claimed by us */
z3fold_page_unlock(zhdr);
return;
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_po
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
_
Patches currently in -mm which might be from vitalywool(a)gmail.com are
The patch titled
Subject: kernel/sysctl.c: do not override max_threads provided by userspace
has been removed from the -mm tree. Its filename was
kernel-sysctlc-do-not-override-max_threads-provided-by-userspace.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Michal Hocko <mhocko(a)suse.com>
Subject: kernel/sysctl.c: do not override max_threads provided by userspace
Partially revert 16db3d3f1170 ("kernel/sysctl.c: threads-max observe
limits") because the patch causes a regression for any workload that
needs to override the auto-tuning of the limit provided by the kernel.
set_max_threads implements a boot-time guesstimate to provide a sensible
limit on the number of concurrently running threads so that runaways will
not deplete all the memory. This is a good thing in general, but there are
workloads which might need to increase this limit for an application to
run (reportedly WebSphere MQ is affected), and that is simply not possible
after the mentioned change. It is also very dubious to override an admin
decision by an estimation that doesn't have any direct relation to the
correctness of kernel operation.
Fix this by dropping set_max_threads from sysctl_max_threads so that any
value is accepted as long as it fits into MAX_THREADS, which is important
to check because allowing more threads could break the internal robust
futex restriction. While at it, do not use MIN_THREADS as the lower
boundary, because it is also only a heuristic for the automatic estimation,
and an admin might have a good reason to prevent new threads from being
created even below this limit.
This became more severe when x86 switched to larger kernel stacks: since
commit 6538b8ea886e ("x86_64: expand kernel stack to 16K") (3.16) we use
THREAD_SIZE_ORDER = 2, and that halved the auto-tuned value.
In the particular case:
  3.12: kernel.threads-max = 515561
  4.4:  kernel.threads-max = 200000
Neither of the two values is really insane on a 32GB machine.
I am not sure we want or need to tune the max_threads value further. If
anything, the tuning should be removed altogether if it proves not useful
in general. But we definitely need a way to override this auto-tuning.
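As a usage illustration (not part of the patch), the limit is exposed as /proc/sys/kernel/threads-max; a small, hypothetical C snippet reads the current value and, as root, writes back a larger one, which with this change is honoured as long as it fits into MAX_THREADS:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/threads-max", "r+");
	unsigned long cur = 0;

	if (!f) {
		perror("threads-max");
		return 1;
	}
	if (fscanf(f, "%lu", &cur) == 1) {
		printf("current kernel.threads-max = %lu\n", cur);

		/* illustrative only: requires root; the kernel still rejects
		 * values that do not fit into MAX_THREADS */
		rewind(f);
		if (fprintf(f, "%lu\n", cur * 2) < 0)
			perror("write");
	}
	fclose(f);
	return 0;
}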
Link: http://lkml.kernel.org/r/20190922065801.GB18814@dhcp22.suse.cz
Fixes: 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Heinrich Schuchardt <xypron.glpk(a)gmx.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/fork.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/fork.c~kernel-sysctlc-do-not-override-max_threads-provided-by-userspace
+++ a/kernel/fork.c
@@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table
struct ctl_table t;
int ret;
int threads = max_threads;
- int min = MIN_THREADS;
+ int min = 1;
int max = MAX_THREADS;
t = *table;
@@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table
if (ret || !write)
return ret;
- set_max_threads(threads);
+ max_threads = threads;
return 0;
}
_
Patches currently in -mm which might be from mhocko(a)suse.com are
The patch titled
Subject: panic: ensure preemption is disabled during panic()
has been removed from the -mm tree. Its filename was
panic-ensure-preemption-is-disabled-during-panic.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Will Deacon <will(a)kernel.org>
Subject: panic: ensure preemption is disabled during panic()
Calling 'panic()' on a kernel with CONFIG_PREEMPT=y can leave the calling
CPU in an infinite loop, but with interrupts and preemption enabled. From
this state, userspace can continue to be scheduled, despite the system
being "dead" as far as the kernel is concerned. This is easily
reproducible on arm64 when booting with "nosmp" on the command line; a
couple of shell scripts print out a periodic "Ping" message whilst another
triggers a crash by writing to /proc/sysrq-trigger:
| sysrq: Trigger a crash
| Kernel panic - not syncing: sysrq triggered crash
| CPU: 0 PID: 1 Comm: init Not tainted 5.2.15 #1
| Hardware name: linux,dummy-virt (DT)
| Call trace:
| dump_backtrace+0x0/0x148
| show_stack+0x14/0x20
| dump_stack+0xa0/0xc4
| panic+0x140/0x32c
| sysrq_handle_reboot+0x0/0x20
| __handle_sysrq+0x124/0x190
| write_sysrq_trigger+0x64/0x88
| proc_reg_write+0x60/0xa8
| __vfs_write+0x18/0x40
| vfs_write+0xa4/0x1b8
| ksys_write+0x64/0xf0
| __arm64_sys_write+0x14/0x20
| el0_svc_common.constprop.0+0xb0/0x168
| el0_svc_handler+0x28/0x78
| el0_svc+0x8/0xc
| Kernel Offset: disabled
| CPU features: 0x0002,24002004
| Memory Limit: none
| ---[ end Kernel panic - not syncing: sysrq triggered crash ]---
| Ping 2!
| Ping 1!
| Ping 1!
| Ping 2!
The issue can also be triggered on x86 kernels if CONFIG_SMP=n, otherwise
local interrupts are disabled in 'smp_send_stop()'.
Disable preemption in 'panic()' before re-enabling interrupts.
Link: http://lkml.kernel.org/r/20191002123538.22609-1-will@kernel.org
Link: https://lore.kernel.org/r/BX1W47JXPMR8.58IYW53H6M5N@dragonstone
Signed-off-by: Will Deacon <will(a)kernel.org>
Reported-by: Xogium <contact(a)xogium.me>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Petr Mladek <pmladek(a)suse.com>
Cc: Feng Tang <feng.tang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/panic.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/panic.c~panic-ensure-preemption-is-disabled-during-panic
+++ a/kernel/panic.c
@@ -180,6 +180,7 @@ void panic(const char *fmt, ...)
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
_
Patches currently in -mm which might be from will(a)kernel.org are
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Fri, 20 Sep 2019 21:54:17 +0200
Subject: [PATCH] nl80211: validate beacon head
We currently don't validate the beacon head, i.e. the header,
fixed part and elements that are to go in front of the TIM
element. This means that the variable elements there can be
malformed, e.g. have a length exceeding the buffer size, but
most downstream code from this assumes that this has already
been checked.
Add the necessary checks to the netlink policy.
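For illustration, a standalone C sketch (hypothetical helper, not the kernel's for_each_element() machinery) of the kind of check the policy gains: walk the id/length-prefixed elements that follow the fixed beacon header and reject the attribute if an element's declared length runs past the end of the buffer, or if the walk does not end exactly at the buffer's end.

#include <stdint.h>
#include <stdio.h>

/* each element is: 1-byte id, 1-byte length, then 'length' bytes of data */
static int elements_ok(const uint8_t *data, size_t len)
{
	size_t off = 0;

	while (off + 2 <= len) {
		uint8_t elen = data[off + 1];

		if (off + 2 + elen > len)
			return 0;	/* element claims more bytes than remain */
		off += 2 + elen;
	}
	return off == len;		/* walk must end exactly at the buffer end */
}

int main(void)
{
	const uint8_t good[] = { 0x00, 0x04, 'T', 'E', 'S', 'T' };	/* SSID element */
	const uint8_t bad[]  = { 0x00, 0x20, 'T', 'E', 'S', 'T' };	/* length > buffer */

	printf("good: %s\n", elements_ok(good, sizeof(good)) ? "valid" : "malformed");
	printf("bad:  %s\n", elements_ok(bad, sizeof(bad)) ? "valid" : "malformed");
	return 0;
}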
Cc: stable(a)vger.kernel.org
Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings")
Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c7…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d21b1581a665..7386421e2ad3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_beacon_head(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ const struct element *elem;
+ const struct ieee80211_mgmt *mgmt = (void *)data;
+ unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+
+ if (len < fixedlen)
+ goto err;
+
+ if (ieee80211_hdrlen(mgmt->frame_control) !=
+ offsetof(struct ieee80211_mgmt, u.beacon))
+ goto err;
+
+ data += fixedlen;
+ len -= fixedlen;
+
+ for_each_element(elem, data, len) {
+ /* nothing */
+ }
+
+ if (for_each_element_completed(elem, data, len))
+ return 0;
+
+err:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
+ return -EINVAL;
+}
+
static int validate_ie_attr(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
- [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_HEAD] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_BEACON_TAIL] =
NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
IEEE80211_MAX_DATA_LEN),
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Fri, 20 Sep 2019 21:54:17 +0200
Subject: [PATCH] nl80211: validate beacon head
We currently don't validate the beacon head, i.e. the header,
fixed part and elements that are to go in front of the TIM
element. This means that the variable elements there can be
malformed, e.g. have a length exceeding the buffer size, but
most downstream code from this assumes that this has already
been checked.
Add the necessary checks to the netlink policy.
Cc: stable(a)vger.kernel.org
Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings")
Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c7…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d21b1581a665..7386421e2ad3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_beacon_head(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ const struct element *elem;
+ const struct ieee80211_mgmt *mgmt = (void *)data;
+ unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+
+ if (len < fixedlen)
+ goto err;
+
+ if (ieee80211_hdrlen(mgmt->frame_control) !=
+ offsetof(struct ieee80211_mgmt, u.beacon))
+ goto err;
+
+ data += fixedlen;
+ len -= fixedlen;
+
+ for_each_element(elem, data, len) {
+ /* nothing */
+ }
+
+ if (for_each_element_completed(elem, data, len))
+ return 0;
+
+err:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
+ return -EINVAL;
+}
+
static int validate_ie_attr(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
- [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_HEAD] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_BEACON_TAIL] =
NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
IEEE80211_MAX_DATA_LEN),
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Fri, 20 Sep 2019 21:54:17 +0200
Subject: [PATCH] nl80211: validate beacon head
We currently don't validate the beacon head, i.e. the header,
fixed part and elements that are to go in front of the TIM
element. This means that the variable elements there can be
malformed, e.g. have a length exceeding the buffer size, but
most downstream code from this assumes that this has already
been checked.
Add the necessary checks to the netlink policy.
Cc: stable(a)vger.kernel.org
Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings")
Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c7…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d21b1581a665..7386421e2ad3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_beacon_head(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ const struct element *elem;
+ const struct ieee80211_mgmt *mgmt = (void *)data;
+ unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+
+ if (len < fixedlen)
+ goto err;
+
+ if (ieee80211_hdrlen(mgmt->frame_control) !=
+ offsetof(struct ieee80211_mgmt, u.beacon))
+ goto err;
+
+ data += fixedlen;
+ len -= fixedlen;
+
+ for_each_element(elem, data, len) {
+ /* nothing */
+ }
+
+ if (for_each_element_completed(elem, data, len))
+ return 0;
+
+err:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
+ return -EINVAL;
+}
+
static int validate_ie_attr(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
- [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_HEAD] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_BEACON_TAIL] =
NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
IEEE80211_MAX_DATA_LEN),
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Fri, 20 Sep 2019 21:54:17 +0200
Subject: [PATCH] nl80211: validate beacon head
We currently don't validate the beacon head, i.e. the header,
fixed part and elements that are to go in front of the TIM
element. This means that the variable elements there can be
malformed, e.g. have a length exceeding the buffer size, but
most downstream code from this assumes that this has already
been checked.
Add the necessary checks to the netlink policy.
Cc: stable(a)vger.kernel.org
Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings")
Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c7…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d21b1581a665..7386421e2ad3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_beacon_head(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ const struct element *elem;
+ const struct ieee80211_mgmt *mgmt = (void *)data;
+ unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+
+ if (len < fixedlen)
+ goto err;
+
+ if (ieee80211_hdrlen(mgmt->frame_control) !=
+ offsetof(struct ieee80211_mgmt, u.beacon))
+ goto err;
+
+ data += fixedlen;
+ len -= fixedlen;
+
+ for_each_element(elem, data, len) {
+ /* nothing */
+ }
+
+ if (for_each_element_completed(elem, data, len))
+ return 0;
+
+err:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
+ return -EINVAL;
+}
+
static int validate_ie_attr(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
- [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_HEAD] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_BEACON_TAIL] =
NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
IEEE80211_MAX_DATA_LEN),
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4ee7dde4c777f14cb0f98dd201491bf6cc15899b Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter(a)intel.com>
Date: Mon, 23 Sep 2019 12:08:09 +0200
Subject: [PATCH] mmc: sdhci: Let drivers define their DMA mask
Add host operation ->set_dma_mask() so that drivers can define their own
DMA masks.
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Tested-by: Nicolin Chen <nicoleotsuka(a)gmail.com>
Signed-off-by: Thierry Reding <treding(a)nvidia.com>
Cc: stable(a)vger.kernel.org # v4.15 +
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 922a5b594c5e..b056400e34b1 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -3781,18 +3781,14 @@ int sdhci_setup_host(struct sdhci_host *host)
host->flags &= ~SDHCI_USE_ADMA;
}
- /*
- * It is assumed that a 64-bit capable device has set a 64-bit DMA mask
- * and *must* do 64-bit DMA. A driver has the opportunity to change
- * that during the first call to ->enable_dma(). Similarly
- * SDHCI_QUIRK2_BROKEN_64_BIT_DMA must be left to the drivers to
- * implement.
- */
if (sdhci_can_64bit_dma(host))
host->flags |= SDHCI_USE_64_BIT_DMA;
if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) {
- ret = sdhci_set_dma_mask(host);
+ if (host->ops->set_dma_mask)
+ ret = host->ops->set_dma_mask(host);
+ else
+ ret = sdhci_set_dma_mask(host);
if (!ret && host->ops->enable_dma)
ret = host->ops->enable_dma(host);
diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index a29c4cd2d92e..0ed3e0eaef5f 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -622,6 +622,7 @@ struct sdhci_ops {
u32 (*irq)(struct sdhci_host *host, u32 intmask);
+ int (*set_dma_mask)(struct sdhci_host *host);
int (*enable_dma)(struct sdhci_host *host);
unsigned int (*get_max_clock)(struct sdhci_host *host);
unsigned int (*get_min_clock)(struct sdhci_host *host);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b960bc448a252428bacca271f3416a8bda3b599b Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicoleotsuka(a)gmail.com>
Date: Mon, 23 Sep 2019 12:08:10 +0200
Subject: [PATCH] mmc: tegra: Implement ->set_dma_mask()
The SDHCI controller on Tegra186 supports 40-bit addressing, which is
usually enough to address all of system memory. However, if the SDHCI
controller is behind an IOMMU, the address space can go beyond. This
happens on Tegra186 and later where the ARM SMMU has an input address
space of 48 bits. If the DMA API is backed by this ARM SMMU, the top-
down IOVA allocator will cause IOV addresses to be returned that the
SDHCI controller cannot access.
Unfortunately, prior to the introduction of the ->set_dma_mask() host
operation, the SDHCI core would set either a 64-bit DMA mask if the
controller claimed to support 64-bit addressing, or a 32-bit DMA mask
otherwise.
Since the full 64 bits cannot be addressed on Tegra, this had to be
worked around in commit 68481a7e1c84 ("mmc: tegra: Mark 64 bit dma
broken on Tegra186") by setting the SDHCI_QUIRK2_BROKEN_64_BIT_DMA
quirk, which effectively restricts the DMA mask to 32 bits.
One disadvantage of this is that the dma_map_*() APIs will now try to use
the swiotlb to bounce DMA to addresses beyond the controller's DMA mask.
This in turn degrades performance and can lead to situations where the
swiotlb buffer is exhausted, which in turn causes DMA transfers to fail.
With the recent introduction of the ->set_dma_mask() host operation,
this can now be properly fixed. For each generation of Tegra, the exact
supported DMA mask can be configured. This kills two birds with one
stone: it avoids the use of bounce buffers because system memory never
exceeds the addressable memory range of the SDHCI controllers on these
devices, and at the same time when an IOMMU is involved, it prevents
IOV addresses from being allocated beyond the addressable range of the
controllers.
Since the DMA mask is now properly handled, the 64-bit DMA quirk can be
removed.
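For a rough sense of what the per-SoC masks below cover, this standalone snippet reproduces the DMA_BIT_MASK() arithmetic (the macro is copied here only for the illustration) and prints the addressable range for the widths used in the patch: 32 bits for Tegra20/30/114, 34 for Tegra124/210, 39 for Tegra194 and 40 for Tegra186.

#include <stdio.h>

#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

int main(void)
{
	const int bits[] = { 32, 34, 39, 40 };
	size_t i;

	for (i = 0; i < sizeof(bits) / sizeof(bits[0]); i++) {
		unsigned long long mask = DMA_BIT_MASK(bits[i]);

		printf("DMA_BIT_MASK(%d) = 0x%010llx -> %llu GiB addressable\n",
		       bits[i], mask, (mask + 1) >> 30);
	}
	return 0;
}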
Signed-off-by: Nicolin Chen <nicoleotsuka(a)gmail.com>
[treding(a)nvidia.com: provide more background in commit message]
Tested-by: Nicolin Chen <nicoleotsuka(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Signed-off-by: Thierry Reding <treding(a)nvidia.com>
Cc: stable(a)vger.kernel.org # v4.15 +
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
index 02d8f524bb9e..7bc950520fd9 100644
--- a/drivers/mmc/host/sdhci-tegra.c
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -4,6 +4,7 @@
*/
#include <linux/delay.h>
+#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -104,6 +105,7 @@
struct sdhci_tegra_soc_data {
const struct sdhci_pltfm_data *pdata;
+ u64 dma_mask;
u32 nvquirks;
u8 min_tap_delay;
u8 max_tap_delay;
@@ -1233,11 +1235,25 @@ static const struct cqhci_host_ops sdhci_tegra_cqhci_ops = {
.update_dcmd_desc = sdhci_tegra_update_dcmd_desc,
};
+static int tegra_sdhci_set_dma_mask(struct sdhci_host *host)
+{
+ struct sdhci_pltfm_host *platform = sdhci_priv(host);
+ struct sdhci_tegra *tegra = sdhci_pltfm_priv(platform);
+ const struct sdhci_tegra_soc_data *soc = tegra->soc_data;
+ struct device *dev = mmc_dev(host->mmc);
+
+ if (soc->dma_mask)
+ return dma_set_mask_and_coherent(dev, soc->dma_mask);
+
+ return 0;
+}
+
static const struct sdhci_ops tegra_sdhci_ops = {
.get_ro = tegra_sdhci_get_ro,
.read_w = tegra_sdhci_readw,
.write_l = tegra_sdhci_writel,
.set_clock = tegra_sdhci_set_clock,
+ .set_dma_mask = tegra_sdhci_set_dma_mask,
.set_bus_width = sdhci_set_bus_width,
.reset = tegra_sdhci_reset,
.platform_execute_tuning = tegra_sdhci_execute_tuning,
@@ -1257,6 +1273,7 @@ static const struct sdhci_pltfm_data sdhci_tegra20_pdata = {
static const struct sdhci_tegra_soc_data soc_data_tegra20 = {
.pdata = &sdhci_tegra20_pdata,
+ .dma_mask = DMA_BIT_MASK(32),
.nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 |
NVQUIRK_ENABLE_BLOCK_GAP_DET,
};
@@ -1283,6 +1300,7 @@ static const struct sdhci_pltfm_data sdhci_tegra30_pdata = {
static const struct sdhci_tegra_soc_data soc_data_tegra30 = {
.pdata = &sdhci_tegra30_pdata,
+ .dma_mask = DMA_BIT_MASK(32),
.nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 |
NVQUIRK_ENABLE_SDR50 |
NVQUIRK_ENABLE_SDR104 |
@@ -1295,6 +1313,7 @@ static const struct sdhci_ops tegra114_sdhci_ops = {
.write_w = tegra_sdhci_writew,
.write_l = tegra_sdhci_writel,
.set_clock = tegra_sdhci_set_clock,
+ .set_dma_mask = tegra_sdhci_set_dma_mask,
.set_bus_width = sdhci_set_bus_width,
.reset = tegra_sdhci_reset,
.platform_execute_tuning = tegra_sdhci_execute_tuning,
@@ -1316,6 +1335,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = {
static const struct sdhci_tegra_soc_data soc_data_tegra114 = {
.pdata = &sdhci_tegra114_pdata,
+ .dma_mask = DMA_BIT_MASK(32),
};
static const struct sdhci_pltfm_data sdhci_tegra124_pdata = {
@@ -1325,22 +1345,13 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = {
SDHCI_QUIRK_NO_HISPD_BIT |
SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC |
SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN,
- .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN |
- /*
- * The TRM states that the SD/MMC controller found on
- * Tegra124 can address 34 bits (the maximum supported by
- * the Tegra memory controller), but tests show that DMA
- * to or from above 4 GiB doesn't work. This is possibly
- * caused by missing programming, though it's not obvious
- * what sequence is required. Mark 64-bit DMA broken for
- * now to fix this for existing users (e.g. Nyan boards).
- */
- SDHCI_QUIRK2_BROKEN_64_BIT_DMA,
+ .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
.ops = &tegra114_sdhci_ops,
};
static const struct sdhci_tegra_soc_data soc_data_tegra124 = {
.pdata = &sdhci_tegra124_pdata,
+ .dma_mask = DMA_BIT_MASK(34),
};
static const struct sdhci_ops tegra210_sdhci_ops = {
@@ -1349,6 +1360,7 @@ static const struct sdhci_ops tegra210_sdhci_ops = {
.write_w = tegra210_sdhci_writew,
.write_l = tegra_sdhci_writel,
.set_clock = tegra_sdhci_set_clock,
+ .set_dma_mask = tegra_sdhci_set_dma_mask,
.set_bus_width = sdhci_set_bus_width,
.reset = tegra_sdhci_reset,
.set_uhs_signaling = tegra_sdhci_set_uhs_signaling,
@@ -1369,6 +1381,7 @@ static const struct sdhci_pltfm_data sdhci_tegra210_pdata = {
static const struct sdhci_tegra_soc_data soc_data_tegra210 = {
.pdata = &sdhci_tegra210_pdata,
+ .dma_mask = DMA_BIT_MASK(34),
.nvquirks = NVQUIRK_NEEDS_PAD_CONTROL |
NVQUIRK_HAS_PADCALIB |
NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
@@ -1383,6 +1396,7 @@ static const struct sdhci_ops tegra186_sdhci_ops = {
.read_w = tegra_sdhci_readw,
.write_l = tegra_sdhci_writel,
.set_clock = tegra_sdhci_set_clock,
+ .set_dma_mask = tegra_sdhci_set_dma_mask,
.set_bus_width = sdhci_set_bus_width,
.reset = tegra_sdhci_reset,
.set_uhs_signaling = tegra_sdhci_set_uhs_signaling,
@@ -1398,20 +1412,13 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = {
SDHCI_QUIRK_NO_HISPD_BIT |
SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC |
SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN,
- .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN |
- /* SDHCI controllers on Tegra186 support 40-bit addressing.
- * IOVA addresses are 48-bit wide on Tegra186.
- * With 64-bit dma mask used for SDHCI, accesses can
- * be broken. Disable 64-bit dma, which would fall back
- * to 32-bit dma mask. Ideally 40-bit dma mask would work,
- * But it is not supported as of now.
- */
- SDHCI_QUIRK2_BROKEN_64_BIT_DMA,
+ .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
.ops = &tegra186_sdhci_ops,
};
static const struct sdhci_tegra_soc_data soc_data_tegra186 = {
.pdata = &sdhci_tegra186_pdata,
+ .dma_mask = DMA_BIT_MASK(40),
.nvquirks = NVQUIRK_NEEDS_PAD_CONTROL |
NVQUIRK_HAS_PADCALIB |
NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
@@ -1424,6 +1431,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra186 = {
static const struct sdhci_tegra_soc_data soc_data_tegra194 = {
.pdata = &sdhci_tegra186_pdata,
+ .dma_mask = DMA_BIT_MASK(39),
.nvquirks = NVQUIRK_NEEDS_PAD_CONTROL |
NVQUIRK_HAS_PADCALIB |
NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c82dd6d078a2bb29d41eda032bb96d05699a524d Mon Sep 17 00:00:00 2001
From: Vincent Chen <vincent.chen(a)sifive.com>
Date: Mon, 16 Sep 2019 16:47:41 +0800
Subject: [PATCH] riscv: Avoid interrupts being erroneously enabled in
handle_exception()
When the handle_exception function handles an exception, interrupts are
unconditionally enabled after the context save is finished. However, this
may erroneously enable interrupts if they were disabled before entering
handle_exception.
For example, suppose one of the WARN_ON() conditions is satisfied in the
scheduler, where interrupts are disabled and rq.lock is held. The WARN_ON
will trigger a break exception, and handle_exception will enable interrupts
before entering the do_trap_break function. During this procedure, if a
timer interrupt is pending, it will be taken as soon as interrupts are
enabled. This can deadlock if rq.lock is taken again in the timer ISR.
Hence, handle_exception() should only enable interrupts when sstatus.SPIE
is 1.
This patch was tested on a HiFive Unleashed board.
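Conceptually (a hypothetical C sketch, not the actual entry.S assembly), the fix makes the re-enabling of interrupts conditional on the saved SPIE bit of sstatus, which records whether interrupts were enabled when the exception was taken:

#include <stdio.h>

#define SR_SPIE (1UL << 5)	/* sstatus "previous interrupt enable" bit */

struct pt_regs_sketch {
	unsigned long sstatus;	/* sstatus saved at exception entry */
};

static void local_irq_enable_sketch(void)
{
	puts("interrupts re-enabled");
}

static void handle_exception_sketch(struct pt_regs_sketch *regs)
{
	/* only re-enable if interrupts were on when the exception hit */
	if (regs->sstatus & SR_SPIE)
		local_irq_enable_sketch();
	else
		puts("interrupts stay disabled");
}

int main(void)
{
	struct pt_regs_sketch from_irqs_off = { .sstatus = 0 };
	struct pt_regs_sketch from_irqs_on  = { .sstatus = SR_SPIE };

	handle_exception_sketch(&from_irqs_off);	/* e.g. WARN_ON under rq.lock */
	handle_exception_sketch(&from_irqs_on);
	return 0;
}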
Signed-off-by: Vincent Chen <vincent.chen(a)sifive.com>
Reviewed-by: Palmer Dabbelt <palmer(a)sifive.com>
[paul.walmsley(a)sifive.com: updated to apply]
Fixes: bcae803a21317 ("RISC-V: Enable IRQ during exception handling")
Cc: David Abdurachmanov <david.abdurachmanov(a)sifive.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paul Walmsley <paul.walmsley(a)sifive.com>
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 74ccfd464071..da7aa88113c2 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -166,9 +166,13 @@ ENTRY(handle_exception)
move a0, sp /* pt_regs */
tail do_IRQ
1:
- /* Exceptions run with interrupts enabled */
+ /* Exceptions run with interrupts enabled or disabled
+ depending on the state of sstatus.SR_SPIE */
+ andi t0, s1, SR_SPIE
+ beqz t0, 1f
csrs CSR_SSTATUS, SR_SIE
+1:
/* Handle syscalls */
li t0, EXC_SYSCALL
beq s4, t0, handle_syscall
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cb6d7c7dc7ff8cace666ddec66334117a6068ce2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 8 Jul 2019 15:03:27 +0100
Subject: [PATCH] drm/i915/userptr: Acquire the page lock around
set_page_dirty()
set_page_dirty says:
For pages with a mapping this should be done under the page lock
for the benefit of asynchronous memory errors who prefer a
consistent dirty state. This rule can be broken in some special
cases, but should be better not to.
Under those rules, it is only safe for us to use the plain set_page_dirty
calls for shmemfs/anonymous memory. Userptr may be used with real
mappings and so needs to use the locked version (set_page_dirty_lock).
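For context, set_page_dirty_lock() is, roughly, the plain set_page_dirty() wrapped in the page lock; the sketch below models that with hypothetical stand-in types (it is not the mm/page-writeback.c source) to show why the locked variant is the safe choice for pages that belong to a real mapping.

#include <stdio.h>

struct page_sketch {
	int locked;
	int dirty;
};

static void lock_page(struct page_sketch *p)   { p->locked = 1; }
static void unlock_page(struct page_sketch *p) { p->locked = 0; }

static int set_page_dirty(struct page_sketch *p)
{
	/* with the page lock held, truncation cannot race with us */
	int was_dirty = p->dirty;

	p->dirty = 1;
	return !was_dirty;
}

static int set_page_dirty_lock(struct page_sketch *p)
{
	int ret;

	lock_page(p);
	ret = set_page_dirty(p);
	unlock_page(p);
	return ret;
}

int main(void)
{
	struct page_sketch page = { 0, 0 };

	printf("newly dirtied: %d\n", set_page_dirty_lock(&page));
	return 0;
}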
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203317
Fixes: 5cc9ed4b9a7a ("drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl")
References: 6dcc693bc57f ("ext4: warn when page is dirtied without buffers")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190708140327.26825-1-chris@…
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 16ccec7fb7da..32d208ede343 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -665,7 +665,15 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
for_each_sgt_page(page, sgt_iter, pages) {
if (obj->mm.dirty)
- set_page_dirty(page);
+ /*
+ * As this may not be anonymous memory (e.g. shmem)
+ * but exist on a real mapping, we have to lock
+ * the page in order to dirty it -- holding
+ * the page reference is not sufficient to
+ * prevent the inode from being truncated.
+ * Play safe and take the lock.
+ */
+ set_page_dirty_lock(page);
mark_page_accessed(page);
put_page(page);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cb6d7c7dc7ff8cace666ddec66334117a6068ce2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 8 Jul 2019 15:03:27 +0100
Subject: [PATCH] drm/i915/userptr: Acquire the page lock around
set_page_dirty()
set_page_dirty says:
For pages with a mapping this should be done under the page lock
for the benefit of asynchronous memory errors who prefer a
consistent dirty state. This rule can be broken in some special
cases, but should be better not to.
Under those rules, it is only safe for us to use the plain set_page_dirty
calls for shmemfs/anonymous memory. Userptr may be used with real
mappings and so needs to use the locked version (set_page_dirty_lock).
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203317
Fixes: 5cc9ed4b9a7a ("drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl")
References: 6dcc693bc57f ("ext4: warn when page is dirtied without buffers")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190708140327.26825-1-chris@…
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 16ccec7fb7da..32d208ede343 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -665,7 +665,15 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
for_each_sgt_page(page, sgt_iter, pages) {
if (obj->mm.dirty)
- set_page_dirty(page);
+ /*
+ * As this may not be anonymous memory (e.g. shmem)
+ * but exist on a real mapping, we have to lock
+ * the page in order to dirty it -- holding
+ * the page reference is not sufficient to
+ * prevent the inode from being truncated.
+ * Play safe and take the lock.
+ */
+ set_page_dirty_lock(page);
mark_page_accessed(page);
put_page(page);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cb6d7c7dc7ff8cace666ddec66334117a6068ce2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 8 Jul 2019 15:03:27 +0100
Subject: [PATCH] drm/i915/userptr: Acquire the page lock around
set_page_dirty()
set_page_dirty says:
For pages with a mapping this should be done under the page lock
for the benefit of asynchronous memory errors who prefer a
consistent dirty state. This rule can be broken in some special
cases, but should be better not to.
Under those rules, it is only safe for us to use the plain set_page_dirty
calls for shmemfs/anonymous memory. Userptr may be used with real
mappings and so needs to use the locked version (set_page_dirty_lock).
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203317
Fixes: 5cc9ed4b9a7a ("drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl")
References: 6dcc693bc57f ("ext4: warn when page is dirtied without buffers")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190708140327.26825-1-chris@…
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 16ccec7fb7da..32d208ede343 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -665,7 +665,15 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
for_each_sgt_page(page, sgt_iter, pages) {
if (obj->mm.dirty)
- set_page_dirty(page);
+ /*
+ * As this may not be anonymous memory (e.g. shmem)
+ * but exist on a real mapping, we have to lock
+ * the page in order to dirty it -- holding
+ * the page reference is not sufficient to
+ * prevent the inode from being truncated.
+ * Play safe and take the lock.
+ */
+ set_page_dirty_lock(page);
mark_page_accessed(page);
put_page(page);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b63fd11cced17fcb8e133def29001b0f6aaa5e06 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:37 +0530
Subject: [PATCH] perf stat: Reset previous counts on repeat with interval
When using 'perf stat' with the repeat and interval options, it shows
wrong values for events.
The wrong values will be shown for the first interval on the second and
subsequent repetitions.
Without the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.000282489 53 faults
2.000282489 513 sched:sched_switch
4.005478208 3,721 faults
4.005478208 2,666 sched:sched_switch
5.025470933 395 faults
5.025470933 1,307 sched:sched_switch
2.009602825 1,84,46,74,40,73,70,95,47,520 faults <------
2.009602825 1,84,46,74,40,73,70,95,49,568 sched:sched_switch <------
4.019612206 4,730 faults
4.019612206 2,746 sched:sched_switch
5.039615484 3,953 faults
5.039615484 1,496 sched:sched_switch
2.000274620 1,84,46,74,40,73,70,95,47,520 faults <------
2.000274620 1,84,46,74,40,73,70,95,47,520 sched:sched_switch <------
4.000480342 4,282 faults
4.000480342 2,303 sched:sched_switch
5.000916811 1,322 faults
5.000916811 1,064 sched:sched_switch
#
prev_raw_counts is allocated when using intervals. This is used when
calculating the difference in the counts of events when using interval.
The current counts are stored in prev_raw_counts to calculate the
differences in the next iteration.
On the first interval of the second and subsequent repetitions,
prev_raw_counts would be the values stored in the last interval of the
previous repetitions, while the current counts will only be for the
first interval of the current repetition.
Hence there is a possibility of events showing up as a huge number.
Fix this by resetting prev_raw_counts whenever perf stat repeats the
command.
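As an aside (not part of the patch), the bogus values above are consistent with an unsigned 64-bit wraparound: when the stale prev_raw_counts value is larger than the freshly read count, the interval delta "current - previous" wraps. A minimal standalone demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* count left over from the last interval of the previous run */
	uint64_t prev = 10000;
	/* count collected in the first interval of the new run */
	uint64_t cur = 5904;

	/* the delta wraps instead of being a small number */
	uint64_t delta = cur - prev;

	printf("delta = %llu\n", (unsigned long long)delta);
	/* prints 18446744073709547520, i.e. 2^64 - 4096 -- the same
	 * magnitude as the bogus values in the report above */
	return 0;
}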
With the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.019349347 2,597 faults
2.019349347 2,753 sched:sched_switch
4.019577372 3,098 faults
4.019577372 2,532 sched:sched_switch
5.019415481 1,879 faults
5.019415481 1,356 sched:sched_switch
2.000178813 8,468 faults
2.000178813 2,254 sched:sched_switch
4.000404621 7,440 faults
4.000404621 1,266 sched:sched_switch
5.040196079 2,458 faults
5.040196079 556 sched:sched_switch
2.000191939 6,870 faults
2.000191939 1,170 sched:sched_switch
4.000414103 541 faults
4.000414103 902 sched:sched_switch
5.000809863 450 faults
5.000809863 364 sched:sched_switch
#
Committer notes:
This has been broken since the cset that introduced the --interval
feature; --repeat + --interval wasn't tested at that point. Add the Fixes
tag so that automatic scripts can pick this up.
Fixes: 13370a9b5bb8 ("perf stat: Add interval printing")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: stable(a)vger.kernel.org # v3.9+
Link: http://lore.kernel.org/lkml/20190904094738.9558-2-srikar@linux.vnet.ibm.com
[ Fixed up conflicts with libperf, i.e. some perf_{evsel,evlist} lost the 'perf' prefix ]
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index eece3d1e429a..fa4b148ecfca 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1952,6 +1952,9 @@ int cmd_stat(int argc, const char **argv)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
+ if (run_idx != 0)
+ perf_evlist__reset_prev_raw_counts(evsel_list);
+
status = run_perf_stat(argc, argv, run_idx);
if (forever && status != -1) {
print_counters(NULL, argc, argv);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 06571209cb0b..fcd54342c04c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -162,6 +162,15 @@ static void perf_evsel__free_prev_raw_counts(struct evsel *evsel)
evsel->prev_raw_counts = NULL;
}
+static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel)
+{
+ if (evsel->prev_raw_counts) {
+ evsel->prev_raw_counts->aggr.val = 0;
+ evsel->prev_raw_counts->aggr.ena = 0;
+ evsel->prev_raw_counts->aggr.run = 0;
+ }
+}
+
static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
{
int ncpus = perf_evsel__nr_cpus(evsel);
@@ -212,6 +221,14 @@ void perf_evlist__reset_stats(struct evlist *evlist)
}
}
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist)
+{
+ struct evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel)
+ perf_evsel__reset_prev_raw_counts(evsel);
+}
+
static void zero_per_pkg(struct evsel *counter)
{
if (counter->per_pkg_mask)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0f9c9f6e2041..edbeb2f63e8d 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -193,6 +193,7 @@ void perf_stat__collect_metric_expr(struct evlist *);
int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
void perf_evlist__free_stats(struct evlist *evlist);
void perf_evlist__reset_stats(struct evlist *evlist);
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist);
int perf_stat_process_counter(struct perf_stat_config *config,
struct evsel *counter);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b63fd11cced17fcb8e133def29001b0f6aaa5e06 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:37 +0530
Subject: [PATCH] perf stat: Reset previous counts on repeat with interval
When using 'perf stat' with the repeat and interval options, it shows
wrong values for events.
The wrong values will be shown for the first interval on the second and
subsequent repetitions.
Without the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.000282489 53 faults
2.000282489 513 sched:sched_switch
4.005478208 3,721 faults
4.005478208 2,666 sched:sched_switch
5.025470933 395 faults
5.025470933 1,307 sched:sched_switch
2.009602825 1,84,46,74,40,73,70,95,47,520 faults <------
2.009602825 1,84,46,74,40,73,70,95,49,568 sched:sched_switch <------
4.019612206 4,730 faults
4.019612206 2,746 sched:sched_switch
5.039615484 3,953 faults
5.039615484 1,496 sched:sched_switch
2.000274620 1,84,46,74,40,73,70,95,47,520 faults <------
2.000274620 1,84,46,74,40,73,70,95,47,520 sched:sched_switch <------
4.000480342 4,282 faults
4.000480342 2,303 sched:sched_switch
5.000916811 1,322 faults
5.000916811 1,064 sched:sched_switch
#
prev_raw_counts is allocated when using intervals. This is used when
calculating the difference in the counts of events when using interval.
The current counts are stored in prev_raw_counts to calculate the
differences in the next iteration.
On the first interval of the second and subsequent repetitions,
prev_raw_counts would be the values stored in the last interval of the
previous repetitions, while the current counts will only be for the
first interval of the current repetition.
Hence there is a possibility of events showing up as a huge number.
Fix this by resetting prev_raw_counts whenever perf stat repeats the
command.
With the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.019349347 2,597 faults
2.019349347 2,753 sched:sched_switch
4.019577372 3,098 faults
4.019577372 2,532 sched:sched_switch
5.019415481 1,879 faults
5.019415481 1,356 sched:sched_switch
2.000178813 8,468 faults
2.000178813 2,254 sched:sched_switch
4.000404621 7,440 faults
4.000404621 1,266 sched:sched_switch
5.040196079 2,458 faults
5.040196079 556 sched:sched_switch
2.000191939 6,870 faults
2.000191939 1,170 sched:sched_switch
4.000414103 541 faults
4.000414103 902 sched:sched_switch
5.000809863 450 faults
5.000809863 364 sched:sched_switch
#
Committer notes:
This has been broken since the cset that introduced the --interval
feature; --repeat + --interval wasn't tested at that point. Add the Fixes
tag so that automatic scripts can pick this up.
Fixes: 13370a9b5bb8 ("perf stat: Add interval printing")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: stable(a)vger.kernel.org # v3.9+
Link: http://lore.kernel.org/lkml/20190904094738.9558-2-srikar@linux.vnet.ibm.com
[ Fixed up conflicts with libperf, i.e. some perf_{evsel,evlist} lost the 'perf' prefix ]
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index eece3d1e429a..fa4b148ecfca 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1952,6 +1952,9 @@ int cmd_stat(int argc, const char **argv)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
+ if (run_idx != 0)
+ perf_evlist__reset_prev_raw_counts(evsel_list);
+
status = run_perf_stat(argc, argv, run_idx);
if (forever && status != -1) {
print_counters(NULL, argc, argv);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 06571209cb0b..fcd54342c04c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -162,6 +162,15 @@ static void perf_evsel__free_prev_raw_counts(struct evsel *evsel)
evsel->prev_raw_counts = NULL;
}
+static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel)
+{
+ if (evsel->prev_raw_counts) {
+ evsel->prev_raw_counts->aggr.val = 0;
+ evsel->prev_raw_counts->aggr.ena = 0;
+ evsel->prev_raw_counts->aggr.run = 0;
+ }
+}
+
static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
{
int ncpus = perf_evsel__nr_cpus(evsel);
@@ -212,6 +221,14 @@ void perf_evlist__reset_stats(struct evlist *evlist)
}
}
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist)
+{
+ struct evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel)
+ perf_evsel__reset_prev_raw_counts(evsel);
+}
+
static void zero_per_pkg(struct evsel *counter)
{
if (counter->per_pkg_mask)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0f9c9f6e2041..edbeb2f63e8d 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -193,6 +193,7 @@ void perf_stat__collect_metric_expr(struct evlist *);
int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
void perf_evlist__free_stats(struct evlist *evlist);
void perf_evlist__reset_stats(struct evlist *evlist);
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist);
int perf_stat_process_counter(struct perf_stat_config *config,
struct evsel *counter);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b63fd11cced17fcb8e133def29001b0f6aaa5e06 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:37 +0530
Subject: [PATCH] perf stat: Reset previous counts on repeat with interval
When using 'perf stat' with the repeat and interval options, it shows
wrong values for events.
The wrong values will be shown for the first interval on the second and
subsequent repetitions.
Without the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.000282489 53 faults
2.000282489 513 sched:sched_switch
4.005478208 3,721 faults
4.005478208 2,666 sched:sched_switch
5.025470933 395 faults
5.025470933 1,307 sched:sched_switch
2.009602825 1,84,46,74,40,73,70,95,47,520 faults <------
2.009602825 1,84,46,74,40,73,70,95,49,568 sched:sched_switch <------
4.019612206 4,730 faults
4.019612206 2,746 sched:sched_switch
5.039615484 3,953 faults
5.039615484 1,496 sched:sched_switch
2.000274620 1,84,46,74,40,73,70,95,47,520 faults <------
2.000274620 1,84,46,74,40,73,70,95,47,520 sched:sched_switch <------
4.000480342 4,282 faults
4.000480342 2,303 sched:sched_switch
5.000916811 1,322 faults
5.000916811 1,064 sched:sched_switch
#
prev_raw_counts is allocated when using intervals. This is used when
calculating the difference in the counts of events when using interval.
The current counts are stored in prev_raw_counts to calculate the
differences in the next iteration.
On the first interval of the second and subsequent repetitions,
prev_raw_counts would be the values stored in the last interval of the
previous repetitions, while the current counts will only be for the
first interval of the current repetition.
Hence there is a possibility of events showing up as a huge number.
Fix this by resetting prev_raw_counts whenever perf stat repeats the
command.
With the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.019349347 2,597 faults
2.019349347 2,753 sched:sched_switch
4.019577372 3,098 faults
4.019577372 2,532 sched:sched_switch
5.019415481 1,879 faults
5.019415481 1,356 sched:sched_switch
2.000178813 8,468 faults
2.000178813 2,254 sched:sched_switch
4.000404621 7,440 faults
4.000404621 1,266 sched:sched_switch
5.040196079 2,458 faults
5.040196079 556 sched:sched_switch
2.000191939 6,870 faults
2.000191939 1,170 sched:sched_switch
4.000414103 541 faults
4.000414103 902 sched:sched_switch
5.000809863 450 faults
5.000809863 364 sched:sched_switch
#
Committer notes:
This has been broken since the cset that introduced the --interval
feature; --repeat + --interval wasn't tested at that point. Add the Fixes
tag so that automatic scripts can pick this up.
Fixes: 13370a9b5bb8 ("perf stat: Add interval printing")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: stable(a)vger.kernel.org # v3.9+
Link: http://lore.kernel.org/lkml/20190904094738.9558-2-srikar@linux.vnet.ibm.com
[ Fixed up conflicts with libperf, i.e. some perf_{evsel,evlist} lost the 'perf' prefix ]
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index eece3d1e429a..fa4b148ecfca 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1952,6 +1952,9 @@ int cmd_stat(int argc, const char **argv)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
+ if (run_idx != 0)
+ perf_evlist__reset_prev_raw_counts(evsel_list);
+
status = run_perf_stat(argc, argv, run_idx);
if (forever && status != -1) {
print_counters(NULL, argc, argv);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 06571209cb0b..fcd54342c04c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -162,6 +162,15 @@ static void perf_evsel__free_prev_raw_counts(struct evsel *evsel)
evsel->prev_raw_counts = NULL;
}
+static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel)
+{
+ if (evsel->prev_raw_counts) {
+ evsel->prev_raw_counts->aggr.val = 0;
+ evsel->prev_raw_counts->aggr.ena = 0;
+ evsel->prev_raw_counts->aggr.run = 0;
+ }
+}
+
static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
{
int ncpus = perf_evsel__nr_cpus(evsel);
@@ -212,6 +221,14 @@ void perf_evlist__reset_stats(struct evlist *evlist)
}
}
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist)
+{
+ struct evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel)
+ perf_evsel__reset_prev_raw_counts(evsel);
+}
+
static void zero_per_pkg(struct evsel *counter)
{
if (counter->per_pkg_mask)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0f9c9f6e2041..edbeb2f63e8d 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -193,6 +193,7 @@ void perf_stat__collect_metric_expr(struct evlist *);
int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
void perf_evlist__free_stats(struct evlist *evlist);
void perf_evlist__reset_stats(struct evlist *evlist);
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist);
int perf_stat_process_counter(struct perf_stat_config *config,
struct evsel *counter);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b63fd11cced17fcb8e133def29001b0f6aaa5e06 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:37 +0530
Subject: [PATCH] perf stat: Reset previous counts on repeat with interval
When using 'perf stat' with the repeat and interval options, it shows wrong
values for events.
The wrong values will be shown for the first interval on the second and
subsequent repetitions.
Without the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.000282489 53 faults
2.000282489 513 sched:sched_switch
4.005478208 3,721 faults
4.005478208 2,666 sched:sched_switch
5.025470933 395 faults
5.025470933 1,307 sched:sched_switch
2.009602825 1,84,46,74,40,73,70,95,47,520 faults <------
2.009602825 1,84,46,74,40,73,70,95,49,568 sched:sched_switch <------
4.019612206 4,730 faults
4.019612206 2,746 sched:sched_switch
5.039615484 3,953 faults
5.039615484 1,496 sched:sched_switch
2.000274620 1,84,46,74,40,73,70,95,47,520 faults <------
2.000274620 1,84,46,74,40,73,70,95,47,520 sched:sched_switch <------
4.000480342 4,282 faults
4.000480342 2,303 sched:sched_switch
5.000916811 1,322 faults
5.000916811 1,064 sched:sched_switch
#
prev_raw_counts is allocated when using intervals and is used to calculate
the difference in event counts between consecutive intervals: the current
counts are stored in prev_raw_counts so that the differences can be
computed in the next iteration.
On the first interval of the second and subsequent repetitions,
prev_raw_counts still holds the values stored in the last interval of the
previous repetition, while the current counts only cover the first
interval of the current repetition.
Hence the events can show up as a huge number.
Fix this by resetting prev_raw_counts whenever perf stat repeats the
command.
With the fix:
# perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5
2.019349347 2,597 faults
2.019349347 2,753 sched:sched_switch
4.019577372 3,098 faults
4.019577372 2,532 sched:sched_switch
5.019415481 1,879 faults
5.019415481 1,356 sched:sched_switch
2.000178813 8,468 faults
2.000178813 2,254 sched:sched_switch
4.000404621 7,440 faults
4.000404621 1,266 sched:sched_switch
5.040196079 2,458 faults
5.040196079 556 sched:sched_switch
2.000191939 6,870 faults
2.000191939 1,170 sched:sched_switch
4.000414103 541 faults
4.000414103 902 sched:sched_switch
5.000809863 450 faults
5.000809863 364 sched:sched_switch
#
Committer notes:
This was broken since the cset introducing the --interval feature, i.e.
--repeat + --interval wasn't tested at that point, add the Fixes tag so
that automatic scripts can pick this up.
Fixes: 13370a9b5bb8 ("perf stat: Add interval printing")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: stable(a)vger.kernel.org # v3.9+
Link: http://lore.kernel.org/lkml/20190904094738.9558-2-srikar@linux.vnet.ibm.com
[ Fixed up conflicts with libperf, i.e. some perf_{evsel,evlist} lost the 'perf' prefix ]
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index eece3d1e429a..fa4b148ecfca 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1952,6 +1952,9 @@ int cmd_stat(int argc, const char **argv)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
+ if (run_idx != 0)
+ perf_evlist__reset_prev_raw_counts(evsel_list);
+
status = run_perf_stat(argc, argv, run_idx);
if (forever && status != -1) {
print_counters(NULL, argc, argv);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 06571209cb0b..fcd54342c04c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -162,6 +162,15 @@ static void perf_evsel__free_prev_raw_counts(struct evsel *evsel)
evsel->prev_raw_counts = NULL;
}
+static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel)
+{
+ if (evsel->prev_raw_counts) {
+ evsel->prev_raw_counts->aggr.val = 0;
+ evsel->prev_raw_counts->aggr.ena = 0;
+ evsel->prev_raw_counts->aggr.run = 0;
+ }
+}
+
static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
{
int ncpus = perf_evsel__nr_cpus(evsel);
@@ -212,6 +221,14 @@ void perf_evlist__reset_stats(struct evlist *evlist)
}
}
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist)
+{
+ struct evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel)
+ perf_evsel__reset_prev_raw_counts(evsel);
+}
+
static void zero_per_pkg(struct evsel *counter)
{
if (counter->per_pkg_mask)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0f9c9f6e2041..edbeb2f63e8d 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -193,6 +193,7 @@ void perf_stat__collect_metric_expr(struct evlist *);
int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
void perf_evlist__free_stats(struct evlist *evlist);
void perf_evlist__reset_stats(struct evlist *evlist);
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist);
int perf_stat_process_counter(struct perf_stat_config *config,
struct evsel *counter);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 443f2d5ba13d65ccfd879460f77941875159d154 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:38 +0530
Subject: [PATCH] perf stat: Fix a segmentation fault when using repeat forever
Observe a segmentation fault when 'perf stat' is asked to repeat forever
with the interval option.
Without fix:
# perf stat -r 0 -I 5000 -e cycles -a sleep 10
# time counts unit events
5.000211692 3,13,89,82,34,157 cycles
10.000380119 1,53,98,52,22,294 cycles
10.040467280 17,16,79,265 cycles
Segmentation fault
This problem was only observed with the forever option, aka -r 0; it works
fine with a limited number of repeats. Calling print_counter() with ts set
to NULL is not correct when interval is set. Hence avoid
print_counter(NULL,..) if interval is set.
With fix:
# perf stat -r 0 -I 5000 -e cycles -a sleep 10
# time counts unit events
5.019866622 3,15,14,43,08,697 cycles
10.039865756 3,15,16,31,95,261 cycles
10.059950628 1,26,05,47,158 cycles
5.009902655 3,14,52,62,33,932 cycles
10.019880228 3,14,52,22,89,154 cycles
10.030543876 66,90,18,333 cycles
5.009848281 3,14,51,98,25,437 cycles
10.029854402 3,15,14,93,04,918 cycles
5.009834177 3,14,51,95,92,316 cycles
Committer notes:
Did the 'git bisect' to find the cset introducing the problem to add the
Fixes tag below, and at that time the problem reproduced as:
(gdb) run stat -r0 -I500 sleep 1
<SNIP>
Program received signal SIGSEGV, Segmentation fault.
print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866
866 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
(gdb) bt
#0 print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866
#1 0x000000000041860a in print_counters (ts=ts@entry=0x0, argc=argc@entry=2, argv=argv@entry=0x7fffffffd640) at builtin-stat.c:938
#2 0x0000000000419a7f in cmd_stat (argc=2, argv=0x7fffffffd640, prefix=<optimized out>) at builtin-stat.c:1411
#3 0x000000000045c65a in run_builtin (p=p@entry=0x6291b8 <commands+216>, argc=argc@entry=5, argv=argv@entry=0x7fffffffd640) at perf.c:370
#4 0x000000000045c893 in handle_internal_command (argc=5, argv=0x7fffffffd640) at perf.c:429
#5 0x000000000045c8f1 in run_argv (argcp=argcp@entry=0x7fffffffd4ac, argv=argv@entry=0x7fffffffd4a0) at perf.c:473
#6 0x000000000045cac9 in main (argc=<optimized out>, argv=<optimized out>) at perf.c:588
(gdb)
Mostly the same as just before this patch:
Program received signal SIGSEGV, Segmentation fault.
0x00000000005874a7 in print_interval (config=0xa1f2a0 <stat_config>, evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964
964 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep);
(gdb) bt
#0 0x00000000005874a7 in print_interval (config=0xa1f2a0 <stat_config>, evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964
#1 0x0000000000588047 in perf_evlist__print_counters (evlist=0xbc9b90, config=0xa1f2a0 <stat_config>, _target=0xa1f0c0 <target>, ts=0x0, argc=2, argv=0x7fffffffd670)
at util/stat-display.c:1172
#2 0x000000000045390f in print_counters (ts=0x0, argc=2, argv=0x7fffffffd670) at builtin-stat.c:656
#3 0x0000000000456bb5 in cmd_stat (argc=2, argv=0x7fffffffd670) at builtin-stat.c:1960
#4 0x00000000004dd2e0 in run_builtin (p=0xa30e00 <commands+288>, argc=5, argv=0x7fffffffd670) at perf.c:310
#5 0x00000000004dd54d in handle_internal_command (argc=5, argv=0x7fffffffd670) at perf.c:362
#6 0x00000000004dd694 in run_argv (argcp=0x7fffffffd4cc, argv=0x7fffffffd4c0) at perf.c:406
#7 0x00000000004dda11 in main (argc=5, argv=0x7fffffffd670) at perf.c:531
(gdb)
Fixes: d4f63a4741a8 ("perf stat: Introduce print_counters function")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: stable(a)vger.kernel.org # v4.2+
Link: http://lore.kernel.org/lkml/20190904094738.9558-3-srikar@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index fa4b148ecfca..60cdd383af81 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1956,7 +1956,7 @@ int cmd_stat(int argc, const char **argv)
perf_evlist__reset_prev_raw_counts(evsel_list);
status = run_perf_stat(argc, argv, run_idx);
- if (forever && status != -1) {
+ if (forever && status != -1 && !interval) {
print_counters(NULL, argc, argv);
perf_stat__reset_stats();
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 443f2d5ba13d65ccfd879460f77941875159d154 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Date: Wed, 4 Sep 2019 15:17:38 +0530
Subject: [PATCH] perf stat: Fix a segmentation fault when using repeat forever
Observe a segmentation fault when 'perf stat' is asked to repeat forever
with the interval option.
Without fix:
# perf stat -r 0 -I 5000 -e cycles -a sleep 10
# time counts unit events
5.000211692 3,13,89,82,34,157 cycles
10.000380119 1,53,98,52,22,294 cycles
10.040467280 17,16,79,265 cycles
Segmentation fault
This problem was only observed with the forever option, aka -r 0; it works
fine with a limited number of repeats. Calling print_counter() with ts set
to NULL is not correct when interval is set. Hence avoid
print_counter(NULL,..) if interval is set.
With fix:
# perf stat -r 0 -I 5000 -e cycles -a sleep 10
# time counts unit events
5.019866622 3,15,14,43,08,697 cycles
10.039865756 3,15,16,31,95,261 cycles
10.059950628 1,26,05,47,158 cycles
5.009902655 3,14,52,62,33,932 cycles
10.019880228 3,14,52,22,89,154 cycles
10.030543876 66,90,18,333 cycles
5.009848281 3,14,51,98,25,437 cycles
10.029854402 3,15,14,93,04,918 cycles
5.009834177 3,14,51,95,92,316 cycles
Committer notes:
Did the 'git bisect' to find the cset introducing the problem to add the
Fixes tag below, and at that time the problem reproduced as:
(gdb) run stat -r0 -I500 sleep 1
<SNIP>
Program received signal SIGSEGV, Segmentation fault.
print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866
866 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
(gdb) bt
#0 print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866
#1 0x000000000041860a in print_counters (ts=ts@entry=0x0, argc=argc@entry=2, argv=argv@entry=0x7fffffffd640) at builtin-stat.c:938
#2 0x0000000000419a7f in cmd_stat (argc=2, argv=0x7fffffffd640, prefix=<optimized out>) at builtin-stat.c:1411
#3 0x000000000045c65a in run_builtin (p=p@entry=0x6291b8 <commands+216>, argc=argc@entry=5, argv=argv@entry=0x7fffffffd640) at perf.c:370
#4 0x000000000045c893 in handle_internal_command (argc=5, argv=0x7fffffffd640) at perf.c:429
#5 0x000000000045c8f1 in run_argv (argcp=argcp@entry=0x7fffffffd4ac, argv=argv@entry=0x7fffffffd4a0) at perf.c:473
#6 0x000000000045cac9 in main (argc=<optimized out>, argv=<optimized out>) at perf.c:588
(gdb)
Mostly the same as just before this patch:
Program received signal SIGSEGV, Segmentation fault.
0x00000000005874a7 in print_interval (config=0xa1f2a0 <stat_config>, evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964
964 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep);
(gdb) bt
#0 0x00000000005874a7 in print_interval (config=0xa1f2a0 <stat_config>, evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964
#1 0x0000000000588047 in perf_evlist__print_counters (evlist=0xbc9b90, config=0xa1f2a0 <stat_config>, _target=0xa1f0c0 <target>, ts=0x0, argc=2, argv=0x7fffffffd670)
at util/stat-display.c:1172
#2 0x000000000045390f in print_counters (ts=0x0, argc=2, argv=0x7fffffffd670) at builtin-stat.c:656
#3 0x0000000000456bb5 in cmd_stat (argc=2, argv=0x7fffffffd670) at builtin-stat.c:1960
#4 0x00000000004dd2e0 in run_builtin (p=0xa30e00 <commands+288>, argc=5, argv=0x7fffffffd670) at perf.c:310
#5 0x00000000004dd54d in handle_internal_command (argc=5, argv=0x7fffffffd670) at perf.c:362
#6 0x00000000004dd694 in run_argv (argcp=0x7fffffffd4cc, argv=0x7fffffffd4c0) at perf.c:406
#7 0x00000000004dda11 in main (argc=5, argv=0x7fffffffd670) at perf.c:531
(gdb)
Fixes: d4f63a4741a8 ("perf stat: Introduce print_counters function")
Signed-off-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: stable(a)vger.kernel.org # v4.2+
Link: http://lore.kernel.org/lkml/20190904094738.9558-3-srikar@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index fa4b148ecfca..60cdd383af81 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1956,7 +1956,7 @@ int cmd_stat(int argc, const char **argv)
perf_evlist__reset_prev_raw_counts(evsel_list);
status = run_perf_stat(argc, argv, run_idx);
- if (forever && status != -1) {
+ if (forever && status != -1 && !interval) {
print_counters(NULL, argc, argv);
perf_stat__reset_stats();
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0216234c2eed1367a318daeb9f4a97d8217412a0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa(a)kernel.org>
Date: Thu, 12 Sep 2019 12:52:35 +0200
Subject: [PATCH] perf tools: Fix segfault in cpu_cache_level__read()
We release the wrong pointer on the error path in the
cpu_cache_level__read() function, leading to a segfault:
(gdb) r record ls
Starting program: /root/perf/tools/perf/perf record ls
...
[ perf record: Woken up 1 times to write data ]
double free or corruption (out)
Thread 1 "perf" received signal SIGABRT, Aborted.
0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6
(gdb) bt
#0 0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6
#1 0x00007ffff7443bac in abort () from /lib64/power9/libc.so.6
#2 0x00007ffff74af8bc in __libc_message () from /lib64/power9/libc.so.6
#3 0x00007ffff74b92b8 in malloc_printerr () from /lib64/power9/libc.so.6
#4 0x00007ffff74bb874 in _int_free () from /lib64/power9/libc.so.6
#5 0x0000000010271260 in __zfree (ptr=0x7fffffffa0b0) at ../../lib/zalloc..
#6 0x0000000010139340 in cpu_cache_level__read (cache=0x7fffffffa090, cac..
#7 0x0000000010143c90 in build_caches (cntp=0x7fffffffa118, size=<optimiz..
...
Releasing the proper pointer.
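For context, the usual way to keep an error path like this from freeing the
wrong member is to unwind only what has already been allocated, in reverse
order. A minimal sketch of that pattern (the function and the *_file
variables are hypothetical, not the upstream code; sysfs__read_str() and
zfree() are the helpers used in the hunk below):

  static int read_cache_strings(struct cpu_cache_level *cache)
  {
          size_t len;

          if (sysfs__read_str(size_file, &cache->size, &len))
                  goto err;
          if (sysfs__read_str(type_file, &cache->type, &len))
                  goto err_size;
          if (sysfs__read_str(map_file, &cache->map, &len))
                  goto err_type;
          return 0;

  err_type:
          zfree(&cache->type);    /* allocated by the "type" read above */
  err_size:
          zfree(&cache->size);    /* allocated by the "size" read above */
  err:
          return -1;
  }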
Fixes: 720e98b5faf1 ("perf tools: Add perf data cache feature")
Signed-off-by: Jiri Olsa <jolsa(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Michael Petlan <mpetlan(a)redhat.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org: # v4.6+
Link: http://lore.kernel.org/lkml/20190912105235.10689-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 5722ff717777..0167f9697172 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1073,7 +1073,7 @@ static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 lev
scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
if (sysfs__read_str(file, &cache->map, &len)) {
- zfree(&cache->map);
+ zfree(&cache->size);
zfree(&cache->type);
return -1;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0216234c2eed1367a318daeb9f4a97d8217412a0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa(a)kernel.org>
Date: Thu, 12 Sep 2019 12:52:35 +0200
Subject: [PATCH] perf tools: Fix segfault in cpu_cache_level__read()
We release the wrong pointer on the error path in the
cpu_cache_level__read() function, leading to a segfault:
(gdb) r record ls
Starting program: /root/perf/tools/perf/perf record ls
...
[ perf record: Woken up 1 times to write data ]
double free or corruption (out)
Thread 1 "perf" received signal SIGABRT, Aborted.
0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6
(gdb) bt
#0 0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6
#1 0x00007ffff7443bac in abort () from /lib64/power9/libc.so.6
#2 0x00007ffff74af8bc in __libc_message () from /lib64/power9/libc.so.6
#3 0x00007ffff74b92b8 in malloc_printerr () from /lib64/power9/libc.so.6
#4 0x00007ffff74bb874 in _int_free () from /lib64/power9/libc.so.6
#5 0x0000000010271260 in __zfree (ptr=0x7fffffffa0b0) at ../../lib/zalloc..
#6 0x0000000010139340 in cpu_cache_level__read (cache=0x7fffffffa090, cac..
#7 0x0000000010143c90 in build_caches (cntp=0x7fffffffa118, size=<optimiz..
...
Releasing the proper pointer.
Fixes: 720e98b5faf1 ("perf tools: Add perf data cache feature")
Signed-off-by: Jiri Olsa <jolsa(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Michael Petlan <mpetlan(a)redhat.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org: # v4.6+
Link: http://lore.kernel.org/lkml/20190912105235.10689-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 5722ff717777..0167f9697172 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1073,7 +1073,7 @@ static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 lev
scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
if (sysfs__read_str(file, &cache->map, &len)) {
- zfree(&cache->map);
+ zfree(&cache->size);
zfree(&cache->type);
return -1;
}
From: Dave Chinner <dchinner(a)redhat.com>
commit c9fbd7bbc23dbdd73364be4d045e5d3612cf6e82 upstream.
We recently had an oops reported on a 4.14 kernel in
xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
and so the m_perag_tree lookup walked into lala land.
Essentially, the machine was under memory pressure when the mount
was being run, xfs_fs_fill_super() failed after allocating the
xfs_mount and attaching it to sb->s_fs_info. It then cleaned up and
freed the xfs_mount, but the sb->s_fs_info field still pointed to
the freed memory. Hence when the superblock shrinker then ran
it fell off the bad pointer.
With the superblock shrinker problem fixed at the VFS level, this
stale s_fs_info pointer is still a problem - we use it
unconditionally in ->put_super when the superblock is being torn
down, and hence we can still trip over it after a ->fill_super
call failure. Hence we need to clear s_fs_info if
xfs_fs_fill_super() fails, and we need to check if it's valid in
the places it can potentially be dereferenced after a ->fill_super
failure.
Signed-Off-By: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong(a)oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong(a)oracle.com>
Signed-off-by: Ajay Kaher <akaher(a)vmware.com>
---
fs/xfs/xfs_super.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ef64a1e..ff3f581 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1572,6 +1572,7 @@ xfs_fs_fill_super(
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
out:
@@ -1589,6 +1590,10 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
+
xfs_notice(mp, "Unmounting Filesystem");
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1598,6 +1603,8 @@ xfs_fs_put_super(
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
}
@@ -1617,6 +1624,9 @@ xfs_fs_nr_cached_objects(
struct super_block *sb,
struct shrink_control *sc)
{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
return xfs_reclaim_inodes_count(XFS_M(sb));
}
--
2.7.4
Florian and Dave reported [1] a NULL pointer dereference in
__reset_isolation_pfn(). While the exact cause is unclear, staring at the code
revealed two bugs, which might be related.
One bug is that if the zone starts in the middle of a pageblock, block_page
might correspond to a different pfn than block_pfn, and then the
pfn_valid_within() checks will check different pfns than those accessed via
struct page. This might result in accessing an uninitialized page in
CONFIG_HOLES_IN_ZONE configs.
The other bug is that end_page refers to the first page of the next
pageblock and not the last page of the current pageblock. The online and
valid check is then wrong, and with sections, the while (page < end_page)
loop might wander off the actual struct page arrays.
[1] https://lore.kernel.org/linux-xfs/87o8z1fvqu.fsf@mid.deneb.enyo.de/
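As a concrete illustration (numbers made up; assuming 4 KiB pages and 2 MiB
pageblocks, so pageblock_nr_pages == 512):

  zone->zone_start_pfn = 700, scan pfn = 800
  pageblock_start_pfn(pfn) = 512

  first bug, old code:  block_page describes max(512, 700) = pfn 700, but pfn
                        is then set to block_pfn = 512, so the struct page
                        accesses and the pfn_valid_within() checks use
                        different pfns
  first bug, new code:  block_pfn = max(512, 700) = 700 is computed first, so
                        block_page and block_pfn agree

  second bug, old code: end = 512 + pageblock_nr_pages = 1024, the first pfn
                        of the *next* pageblock
  second bug, new code: end = pageblock_end_pfn(pfn) - 1 = 1023, the last pfn
                        of the current pageblock, and the loop condition
                        changes from "page < end_page" to "page <= end_page"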
Reported-by: Florian Weimer <fw(a)deneb.enyo.de>
Reported-by: Dave Chinner <david(a)fromorbit.com>
Fixes: 6b0868c820ff ("mm/compaction.c: correct zone boundary handling when resetting pageblock skip hints")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
---
mm/compaction.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index ce08b39d85d4..672d3c78c6ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
/* Ensure the start of the pageblock or zone is online and valid */
block_pfn = pageblock_start_pfn(pfn);
- block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn));
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
if (block_page) {
page = block_page;
pfn = block_pfn;
}
/* Ensure the end of the pageblock or zone is online and valid */
- block_pfn += pageblock_nr_pages;
+ block_pfn = pageblock_end_pfn(pfn) - 1;
block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
end_page = pfn_to_online_page(block_pfn);
if (!end_page)
@@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
- } while (page < end_page);
+ } while (page <= end_page);
return false;
}
--
2.23.0
This is a rare corner case, but it does happen:
In perf_rotate_context(), when the first cpu flexible event fails to
schedule, cpu_rotate is 1, while cpu_event is NULL. Since cpu_event is
NULL, perf_rotate_context will _NOT_ call cpu_ctx_sched_out(), thus
cpuctx->ctx.is_active will have EVENT_FLEXIBLE set. Then, the next
perf_event_sched_in() will skip all cpu flexible events because of the
EVENT_FLEXIBLE bit.
In the next call of perf_rotate_context(), cpu_rotate stays 1, and
cpu_event stays NULL, so this process repeats. The end result is, flexible
events on this cpu will not be scheduled (until another event being added
to the cpuctx).
A similar issue may happen with the task_ctx, but it is usually not a
problem because the task_ctx moves around different CPUs.
Fix this corner case by using cpu_rotate and task_rotate to gate calls for
(cpu_)ctx_sched_out and rotate_ctx. Also enable rotate_ctx() to handle
event == NULL case.
Fixes: 8d5bce0c37fa ("perf/core: Optimize perf_rotate_context() event scheduling")
Cc: stable(a)vger.kernel.org # v4.17+
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Arnaldo Carvalho de Melo <acme(a)kernel.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Song Liu <songliubraving(a)fb.com>
---
kernel/events/core.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4655adbbae10..50021735f367 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3775,6 +3775,13 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
if (ctx->rotate_disable)
return;
+ /* if no event specified, try to rotate the first event */
+ if (!event)
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ if (!event)
+ return;
+
perf_event_groups_delete(&ctx->flexible_groups, event);
perf_event_groups_insert(&ctx->flexible_groups, event);
}
@@ -3816,14 +3823,14 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
+ if (task_rotate || (task_ctx && cpu_rotate))
ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
+ if (cpu_rotate)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- if (task_event)
+ if (task_rotate)
rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_rotate)
rotate_ctx(&cpuctx->ctx, cpu_event);
perf_event_sched_in(cpuctx, task_ctx, current);
--
2.17.1
From: Sean Paul <seanpaul(a)chromium.org>
Since the dirtyfb ioctl doesn't give us any hints as to which plane is
scanning out the fb it's marking as damaged, we need to loop through
planes to find it.
Currently we just reach into plane state and check, but that can race
with another commit changing the fb out from under us. This patch locks
the plane before checking the fb and will release the lock if the plane
is not displaying the dirty fb.
Fixes: b9fc5e01d1ce ("drm: Add helper to implement legacy dirtyfb")
Cc: Rob Clark <robdclark(a)gmail.com>
Cc: Deepak Rawat <drawat(a)vmware.com>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: Thomas Hellstrom <thellstrom(a)vmware.com>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <maxime.ripard(a)bootlin.com>
Cc: Sean Paul <sean(a)poorly.run>
Cc: David Airlie <airlied(a)linux.ie>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.0+
Reported-by: Daniel Vetter <daniel(a)ffwll.ch>
Signed-off-by: Sean Paul <seanpaul(a)chromium.org>
---
drivers/gpu/drm/drm_damage_helper.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/drm_damage_helper.c b/drivers/gpu/drm/drm_damage_helper.c
index 8230dac01a89..3a4126dc2520 100644
--- a/drivers/gpu/drm/drm_damage_helper.c
+++ b/drivers/gpu/drm/drm_damage_helper.c
@@ -212,8 +212,14 @@ int drm_atomic_helper_dirtyfb(struct drm_framebuffer *fb,
drm_for_each_plane(plane, fb->dev) {
struct drm_plane_state *plane_state;
- if (plane->state->fb != fb)
+ ret = drm_modeset_lock(&plane->mutex, state->acquire_ctx);
+ if (ret)
+ goto out;
+
+ if (plane->state->fb != fb) {
+ drm_modeset_unlock(&plane->mutex);
continue;
+ }
plane_state = drm_atomic_get_plane_state(state, plane);
if (IS_ERR(plane_state)) {
--
Sean Paul, Software Engineer, Google / Chromium OS
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/213190
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
CKI - Continuous Kernel Integration
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 52020d3f6633 - Linux 5.3.5
We grabbed the da4f76bfb1e4 commit of the stable queue repository.
We then merged the patchset with `git am`:
s390-process-avoid-potential-reading-of-freed-stack.patch
s390-sclp-fix-bit-checked-for-has_sipl.patch
kvm-s390-test-for-bad-access-register-and-size-at-the-start-of-s390_mem_op.patch
s390-topology-avoid-firing-events-before-kobjs-are-created.patch
s390-cio-avoid-calling-strlen-on-null-pointer.patch
s390-cio-exclude-subchannels-with-no-parent-from-pseudo-check.patch
s390-dasd-fix-error-handling-during-online-processing.patch
revert-s390-dasd-add-discard-support-for-ese-volumes.patch
kvm-s390-fix-__insn32_query-inline-assembly.patch
kvm-ppc-book3s-enable-xive-native-capability-only-if-opal-has-required-functions.patch
kvm-ppc-book3s-hv-xive-free-escalation-interrupts-before-disabling-the-vp.patch
kvm-ppc-book3s-hv-don-t-push-xive-context-when-not-using-xive-device.patch
kvm-ppc-book3s-hv-fix-race-in-re-enabling-xive-escalation-interrupts.patch
kvm-ppc-book3s-hv-check-for-mmu-ready-on-piggybacked-virtual-cores.patch
kvm-ppc-book3s-hv-don-t-lose-pending-doorbell-request-on-migration-on-p9.patch
kvm-x86-fix-userspace-set-invalid-cr4.patch
nbd-fix-max-number-of-supported-devs.patch
pm-devfreq-tegra-fix-khz-to-hz-conversion.patch
asoc-define-a-set-of-dapm-pre-post-up-events.patch
asoc-sgtl5000-improve-vag-power-and-mute-control.patch
powerpc-xive-implement-get_irqchip_state-method-for-xive-to-fix-shutdown-race.patch
powerpc-mce-fix-mce-handling-for-huge-pages.patch
powerpc-mce-schedule-work-from-irq_work.patch
powerpc-603-fix-handling-of-the-dirty-flag.patch
powerpc-32s-fix-boot-failure-with-debug_pagealloc-without-kasan.patch
powerpc-ptdump-fix-addresses-display-on-ppc32.patch
powerpc-powernv-restrict-opal-symbol-map-to-only-be-readable-by-root.patch
powerpc-pseries-fix-cpu_hotplug_lock-acquisition-in-resize_hpt.patch
powerpc-powernv-ioda-fix-race-in-tce-level-allocation.patch
powerpc-kasan-fix-parallel-loading-of-modules.patch
powerpc-kasan-fix-shadow-area-set-up-for-modules.patch
powerpc-book3s64-mm-don-t-do-tlbie-fixup-for-some-hardware-revisions.patch
powerpc-book3s64-radix-rename-cpu_ftr_p9_tlbie_bug-feature-flag.patch
powerpc-mm-add-a-helper-to-select-page_kernel_ro-or-page_readonly.patch
powerpc-mm-fix-an-oops-in-kasan_mmu_init.patch
powerpc-mm-fixup-tlbie-vs-mtpidr-mtlpidr-ordering-issue-on-power9.patch
can-mcp251x-mcp251x_hw_reset-allow-more-time-after-a-reset.patch
tools-lib-traceevent-fix-robust-test-of-do_generate_dynamic_list_file.patch
tools-lib-traceevent-do-not-free-tep-cmdlines-in-add_new_comm-on-failure.patch
crypto-qat-silence-smp_processor_id-warning.patch
crypto-skcipher-unmap-pages-after-an-external-error.patch
crypto-cavium-zip-add-missing-single_release.patch
crypto-caam-qi-fix-error-handling-in-ern-handler.patch
crypto-caam-fix-concurrency-issue-in-givencrypt-descriptor.patch
crypto-ccree-account-for-tee-not-ready-to-report.patch
crypto-ccree-use-the-full-crypt-length-value.patch
mips-treat-loongson-extensions-as-ases.patch
power-supply-sbs-battery-use-correct-flags-field.patch
power-supply-sbs-battery-only-return-health-when-battery-present.patch
tracing-make-sure-variable-reference-alias-has-correct-var_ref_idx.patch
usercopy-avoid-highmem-pfn-warning.patch
timer-read-jiffies-once-when-forwarding-base-clk.patch
Compile testing
---------------
We compiled the kernel for 3 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
🚧 ✅ LTP lite
🚧 ✅ ALSA PCM loopback test
🚧 ✅ ALSA Control (mixer) Userspace Element test
🚧 ✅ trace: ftrace/tracer
Host 2:
✅ Boot test
✅ selinux-policy: serge-testsuite
🚧 ✅ Storage blktests
x86_64:
Host 1:
✅ Boot test
✅ selinux-policy: serge-testsuite
🚧 ✅ Storage blktests
Host 2:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ pciutils: sanity smoke test
✅ Usex - version 1.9-29
✅ stress: stress-ng
🚧 ✅ LTP lite
🚧 ✅ ALSA PCM loopback test
🚧 ✅ ALSA Control (mixer) Userspace Element test
🚧 ✅ trace: ftrace/tracer
Test sources: https://github.com/CKI-project/tests-beaker
💚 Pull requests are welcome for new tests or improvements to existing tests!
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 454de1e7d970d6bc567686052329e4814842867c
Gitweb: https://git.kernel.org/tip/454de1e7d970d6bc567686052329e4814842867c
Author: Janakarajan Natarajan <Janakarajan.Natarajan(a)amd.com>
AuthorDate: Mon, 07 Oct 2019 19:00:22
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Tue, 08 Oct 2019 13:25:24 +02:00
x86/asm: Fix MWAITX C-state hint value
As per "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
and System Instructions", MWAITX EAX[7:4]+1 specifies the optional hint
of the optimized C-state. For C0 state, EAX[7:4] should be set to 0xf.
Currently, a value of 0xf is set for EAX[3:0] instead of EAX[7:4]. Fix
this by changing MWAITX_DISABLE_CSTATES from 0xf to 0xf0.
This hasn't had any implications so far because setting reserved bits in
EAX is simply ignored by the CPU.
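Spelled out bit-wise (per the description above):

  EAX bits 7..4: the C-state hint field; 0xf here means "do not enter any
                 deep C-state"
  EAX bits 3..0: not the hint field; a value there is simply ignored

  old: #define MWAITX_DISABLE_CSTATES 0xf   /* 0000 1111b - hint field is 0x0 */
  new: #define MWAITX_DISABLE_CSTATES 0xf0  /* 1111 0000b - hint field is 0xf */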
[ bp: Fixup comment in delay_mwaitx() and massage. ]
Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan(a)amd.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: Frederic Weisbecker <frederic(a)kernel.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "x86(a)kernel.org" <x86(a)kernel.org>
Cc: Zhenzhong Duan <zhenzhong.duan(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20191007190011.4859-1-Janakarajan.Natarajan@amd.c…
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
---
arch/x86/include/asm/mwait.h | 2 +-
arch/x86/lib/delay.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e28f8b7..9d5252c 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -21,7 +21,7 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
#define MWAITX_ECX_TIMER_ENABLE BIT(1)
#define MWAITX_MAX_LOOPS ((u32)-1)
-#define MWAITX_DISABLE_CSTATES 0xf
+#define MWAITX_DISABLE_CSTATES 0xf0
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index b7375dc..c126571 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops)
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
- * AMD, like Intel, supports the EAX hint and EAX=0xf
- * means, do not enter any deep C-state and we use it
+ * AMD, like Intel's MWAIT version, supports the EAX hint and
+ * EAX=0xf0 means, do not enter any deep C-state and we use it
* here in delay() to minimize wakeup latency.
*/
__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
The quota/period ratio is used to ensure a child task group won't get more
bandwidth than the parent task group, and is calculated as:
normalized_cfs_quota() = [(quota_us << 20) / period_us]
If the quota/period ratio is changed during this scaling due to precision
loss, it will cause an inconsistency between parent and child task groups.
See the example below:
A userspace container manager (kubelet) does three operations:
1) Create a parent cgroup, set quota to 1,000us and period to 10,000us.
2) Create a few children cgroups.
3) Set quota to 1,000us and period to 10,000us on a child cgroup.
These operations are expected to succeed. However, if the scaling of
147/128 happens before step 3), quota and period of the parent cgroup
will be changed:
new_quota: 1148437ns, 1148us
new_period: 11484375ns, 11484us
And when step 3) comes in, the ratio of the child cgroup will be 104857,
which will be larger than the parent cgroup ratio (104821), and will
fail.
Scaling them by a factor of 2 will fix the problem.
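Working through the integer arithmetic with the numbers above (quota and
period in microseconds, ratio = (quota << 20) / period, rounded down):

  parent, before scaling:      (1000 << 20) / 10000 = 104857
  parent, after 147/128:       (1148 << 20) / 11484 = 104821
  child request in step 3):    (1000 << 20) / 10000 = 104857 > 104821 -> fails
  parent, scaled by 2 instead: (2000 << 20) / 20000 = 104857 (ratio preserved)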
Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
Signed-off-by: Xuewei Zhang <xueweiz(a)google.com>
---
kernel/sched/fair.c | 36 ++++++++++++++++++++++--------------
1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 83ab35e2374f..b3d3d0a231cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4926,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid lossing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
--
2.23.0.581.g78d2f28ef7-goog
A long time ago, a similar deadlock in show_slab_objects() was fixed
[1]. However, apparently due to commits like 01fb58bcba63
("slab: remove synchronous synchronize_sched() from memcg cache
deactivation path") and 03afc0e25f7f ("slab: get_online_mems for
kmem_cache_{create,destroy,shrink}"), this kind of deadlock is back:
just reading files in /sys/kernel/slab will generate the lockdep
splat below.
Since the "mem_hotplug_lock" here is only taken to obtain a stable online
node mask while racing with NUMA node hotplug, in the worst case the
results may be miscalculated during NUMA node hotplug, but they will be
corrected by later reads of the same files.
WARNING: possible circular locking dependency detected
------------------------------------------------------
cat/5224 is trying to acquire lock:
ffff900012ac3120 (mem_hotplug_lock.rw_sem){++++}, at:
show_slab_objects+0x94/0x3a8
but task is already holding lock:
b8ff009693eee398 (kn->count#45){++++}, at: kernfs_seq_start+0x44/0xf0
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (kn->count#45){++++}:
lock_acquire+0x31c/0x360
__kernfs_remove+0x290/0x490
kernfs_remove+0x30/0x44
sysfs_remove_dir+0x70/0x88
kobject_del+0x50/0xb0
sysfs_slab_unlink+0x2c/0x38
shutdown_cache+0xa0/0xf0
kmemcg_cache_shutdown_fn+0x1c/0x34
kmemcg_workfn+0x44/0x64
process_one_work+0x4f4/0x950
worker_thread+0x390/0x4bc
kthread+0x1cc/0x1e8
ret_from_fork+0x10/0x18
-> #1 (slab_mutex){+.+.}:
lock_acquire+0x31c/0x360
__mutex_lock_common+0x16c/0xf78
mutex_lock_nested+0x40/0x50
memcg_create_kmem_cache+0x38/0x16c
memcg_kmem_cache_create_func+0x3c/0x70
process_one_work+0x4f4/0x950
worker_thread+0x390/0x4bc
kthread+0x1cc/0x1e8
ret_from_fork+0x10/0x18
-> #0 (mem_hotplug_lock.rw_sem){++++}:
validate_chain+0xd10/0x2bcc
__lock_acquire+0x7f4/0xb8c
lock_acquire+0x31c/0x360
get_online_mems+0x54/0x150
show_slab_objects+0x94/0x3a8
total_objects_show+0x28/0x34
slab_attr_show+0x38/0x54
sysfs_kf_seq_show+0x198/0x2d4
kernfs_seq_show+0xa4/0xcc
seq_read+0x30c/0x8a8
kernfs_fop_read+0xa8/0x314
__vfs_read+0x88/0x20c
vfs_read+0xd8/0x10c
ksys_read+0xb0/0x120
__arm64_sys_read+0x54/0x88
el0_svc_handler+0x170/0x240
el0_svc+0x8/0xc
other info that might help us debug this:
Chain exists of:
mem_hotplug_lock.rw_sem --> slab_mutex --> kn->count#45
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(kn->count#45);
lock(slab_mutex);
lock(kn->count#45);
lock(mem_hotplug_lock.rw_sem);
*** DEADLOCK ***
3 locks held by cat/5224:
#0: 9eff00095b14b2a0 (&p->lock){+.+.}, at: seq_read+0x4c/0x8a8
#1: 0eff008997041480 (&of->mutex){+.+.}, at: kernfs_seq_start+0x34/0xf0
#2: b8ff009693eee398 (kn->count#45){++++}, at:
kernfs_seq_start+0x44/0xf0
stack backtrace:
Call trace:
dump_backtrace+0x0/0x248
show_stack+0x20/0x2c
dump_stack+0xd0/0x140
print_circular_bug+0x368/0x380
check_noncircular+0x248/0x250
validate_chain+0xd10/0x2bcc
__lock_acquire+0x7f4/0xb8c
lock_acquire+0x31c/0x360
get_online_mems+0x54/0x150
show_slab_objects+0x94/0x3a8
total_objects_show+0x28/0x34
slab_attr_show+0x38/0x54
sysfs_kf_seq_show+0x198/0x2d4
kernfs_seq_show+0xa4/0xcc
seq_read+0x30c/0x8a8
kernfs_fop_read+0xa8/0x314
__vfs_read+0x88/0x20c
vfs_read+0xd8/0x10c
ksys_read+0xb0/0x120
__arm64_sys_read+0x54/0x88
el0_svc_handler+0x170/0x240
el0_svc+0x8/0xc
[1] http://lkml.iu.edu/hypermail/linux/kernel/1101.0/02850.html
Fixes: 01fb58bcba63 ("slab: remove synchronous synchronize_sched() from memcg cache deactivation path")
Fixes: 03afc0e25f7f ("slab: get_online_mems for kmem_cache_{create,destroy,shrink}")
Cc: stable(a)vger.kernel.org
Acked-by: Michal Hocko <mhocko(a)suse.com>
Signed-off-by: Qian Cai <cai(a)lca.pw>
---
v2: fix the comment alignment and improve the changelog.
mm/slub.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 42c1b3af3c98..86bfd9d98af5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4838,7 +4838,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4879,7 +4885,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
--
1.8.3.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 9b69cab42e5d14b8f0467566e3d97e682365db2d
Gitweb: https://git.kernel.org/tip/9b69cab42e5d14b8f0467566e3d97e682365db2d
Author: Janakarajan Natarajan <Janakarajan.Natarajan(a)amd.com>
AuthorDate: Mon, 07 Oct 2019 19:00:22
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Tue, 08 Oct 2019 09:48:09 +02:00
x86/asm: Fix MWAITX C-state hint value
As per "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
and System Instructions", MWAITX EAX[7:4]+1 specifies the optional hint
of the optimized C-state. For C0 state, EAX[7:4] should be set to 0xf.
Currently, a value of 0xf is set for EAX[3:0] instead of EAX[7:4]. Fix
this by changing MWAITX_DISABLE_CSTATES from 0xf to 0xf0.
This hasn't had any implications so far because setting reserved bits in
EAX is simply ignored by the CPU.
[ bp: Fixup comment in delay_mwaitx() and massage. ]
Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan(a)amd.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: Frederic Weisbecker <frederic(a)kernel.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "x86(a)kernel.org" <x86(a)kernel.org>
Cc: Zhenzhong Duan <zhenzhong.duan(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20191007190011.4859-1-Janakarajan.Natarajan@amd.c…
---
arch/x86/include/asm/mwait.h | 2 +-
arch/x86/lib/delay.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e28f8b7..9d5252c 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -21,7 +21,7 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
#define MWAITX_ECX_TIMER_ENABLE BIT(1)
#define MWAITX_MAX_LOOPS ((u32)-1)
-#define MWAITX_DISABLE_CSTATES 0xf
+#define MWAITX_DISABLE_CSTATES 0xf0
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index b7375dc..c126571 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops)
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
- * AMD, like Intel, supports the EAX hint and EAX=0xf
- * means, do not enter any deep C-state and we use it
+ * AMD, like Intel's MWAIT version, supports the EAX hint and
+ * EAX=0xf0 means, do not enter any deep C-state and we use it
* here in delay() to minimize wakeup latency.
*/
__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
The udev stored in ep->hcpriv might be NULL if the TT buffer is cleared
due to a halted control endpoint during device enumeration.
xhci_clear_tt_buffer_complete() is called by hub_tt_work() once it's
scheduled, and by then the USB core might have freed and allocated a
new udev for the next enumeration attempt.
Fixes: ef513be0a905 ("usb: xhci: Add Clear_TT_Buffer")
Cc: <stable(a)vger.kernel.org> # v5.3
Reported-by: Johan Hovold <johan(a)kernel.org>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 00f3804f7aa7..517ec3206f6e 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -5238,8 +5238,16 @@ static void xhci_clear_tt_buffer_complete(struct usb_hcd *hcd,
unsigned int ep_index;
unsigned long flags;
+ /*
+ * udev might be NULL if tt buffer is cleared during a failed device
+ * enumeration due to a halted control endpoint. Usb core might
+ * have allocated a new udev for the next enumeration attempt.
+ */
+
xhci = hcd_to_xhci(hcd);
udev = (struct usb_device *)ep->hcpriv;
+ if (!udev)
+ return;
slot_id = udev->slot_id;
ep_index = xhci_get_endpoint_index(&ep->desc);
--
2.7.4
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001
From: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Date: Thu, 26 Sep 2019 15:51:01 +0200
Subject: [PATCH] tick: broadcast-hrtimer: Fix a race in bc_set_next
When a cpu requests broadcasting, before starting the tick broadcast
hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active
using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide
the required synchronization when the callback is active on another core.
The callback could have already executed tick_handle_oneshot_broadcast()
and could have also returned. But still there is a small time window where
the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns
without doing anything, but the next_event of the tick broadcast clock
device is already set to a timeout value.
In the race condition diagram below, CPU #1 is running the timer callback
and CPU #2 is entering idle state and so calls bc_set_next().
In the worst case, the next_event will contain an expiry time, but the
hrtimer will not be started which happens when the racing callback returns
HRTIMER_NORESTART. The hrtimer might never recover if all further requests
from the CPUs to subscribe to tick broadcast have a timeout greater than the
next_event of the tick broadcast clock device. This leads to cascading
failures, finally observed as RCU stall warnings.
Here is a depiction of the race condition
CPU #1 (Running timer callback)                 CPU #2 (Enter idle and
                                                subscribe to tick broadcast)
---------------------                           ---------------------
__run_hrtimer()                                 tick_broadcast_enter()
  bc_handler()                                    __tick_broadcast_oneshot_control()
    tick_handle_oneshot_broadcast()
      raw_spin_lock(&tick_broadcast_lock);
      dev->next_event = KTIME_MAX;              //wait for tick_broadcast_lock
      //next_event for tick broadcast clock
      //set to KTIME_MAX since no other cores
      //subscribed to tick broadcasting
      raw_spin_unlock(&tick_broadcast_lock);
    if (dev->next_event == KTIME_MAX)
      return HRTIMER_NORESTART
    // callback function exits without
    // restarting the hrtimer                   //tick_broadcast_lock acquired
                                                raw_spin_lock(&tick_broadcast_lock);
                                                tick_broadcast_set_event()
                                                  clockevents_program_event()
                                                    dev->next_event = expires;
                                                    bc_set_next()
                                                      hrtimer_try_to_cancel()
                                                      //returns -1 since the timer
                                                      //callback is active. Exits without
                                                      //restarting the timer
  cpu_base->running = NULL;
The comment that hrtimer cannot be armed from within the callback is
wrong. It is fine to start the hrtimer from within the callback. Also it is
safe to start the hrtimer from the enter/exit idle code while the broadcast
handler is active. The enter/exit idle code and the broadcast handler are
synchronized using tick_broadcast_lock. So there is no need for the
existing try to cancel logic. All this can be removed which will eliminate
the race condition as well.
Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast")
Originally-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan…
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index c1f5bb590b5e..b5a65e212df2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt)
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
- int bc_moved;
/*
- * We try to cancel the timer first. If the callback is on
- * flight on some other cpu then we let it handle it. If we
- * were able to cancel the timer nothing can rearm it as we
- * own broadcast_lock.
+ * This is called either from enter/exit idle code or from the
+ * broadcast handler. In all cases tick_broadcast_lock is held.
*
- * However we can also be called from the event handler of
- * ce_broadcast_hrtimer itself when it expires. We cannot
- * restart the timer because we are in the callback, but we
- * can set the expiry time and let the callback return
- * HRTIMER_RESTART.
+ * hrtimer_cancel() cannot be called here neither from the
+ * broadcast handler nor from the enter/exit idle code. The idle
+ * code can run into the problem described in bc_shutdown() and the
+ * broadcast handler cannot wait for itself to complete for obvious
+ * reasons.
*
- * Since we are in the idle loop at this point and because
- * hrtimer_{start/cancel} functions call into tracing,
- * calls to these functions must be bound within RCU_NONIDLE.
+ * Each caller tries to arm the hrtimer on its own CPU, but if the
+ * hrtimer callbback function is currently running, then
+ * hrtimer_start() cannot move it and the timer stays on the CPU on
+ * which it is assigned at the moment.
+ *
+ * As this can be called from idle code, the hrtimer_start()
+ * invocation has to be wrapped with RCU_NONIDLE() as
+ * hrtimer_start() can call into tracing.
*/
- RCU_NONIDLE(
- {
- bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
- if (bc_moved) {
- hrtimer_start(&bctimer, expires,
- HRTIMER_MODE_ABS_PINNED_HARD);
- }
- }
- );
-
- if (bc_moved) {
- /* Bind the "device" to the cpu */
- bc->bound_on = smp_processor_id();
- } else if (bc->bound_on == smp_processor_id()) {
- hrtimer_set_expires(&bctimer, expires);
- }
+ RCU_NONIDLE( {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
+ *
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
+ */
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+ } );
return 0;
}
@@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
- return HRTIMER_RESTART;
-
return HRTIMER_NORESTART;
}
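For context, hrtimer_try_to_cancel() returns -1 while the callback is running, 0 if the timer was not queued, and 1 if it was dequeued. A stripped-down sketch of the old, racy flow (for illustration only; it is what the hunk above removes) shows how the -1 case leaves the timer unarmed even though next_event was already programmed:
	/* Simplified sketch of the old bc_set_next() logic */
	bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
	if (bc_moved)		/* 0 or 1: timer is idle, safe to (re)arm */
		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
	/*
	 * -1: bc_handler() is running on another CPU. If that callback then
	 * returns HRTIMER_NORESTART, nobody rearms the hrtimer although
	 * clockevents_program_event() has already stored 'expires' in
	 * dev->next_event, so subsequent broadcast wakeups can be lost.
	 */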
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001
From: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Date: Thu, 26 Sep 2019 15:51:01 +0200
Subject: [PATCH] tick: broadcast-hrtimer: Fix a race in bc_set_next
When a cpu requests broadcasting, before starting the tick broadcast
hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active
using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide
the required synchronization when the callback is active on another core.
The callback could have already executed tick_handle_oneshot_broadcast()
and could have also returned. But still there is a small time window where
the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns
without doing anything, but the next_event of the tick broadcast clock
device is already set to a timeout value.
In the race condition diagram below, CPU #1 is running the timer callback
and CPU #2 is entering idle state and so calls bc_set_next().
In the worst case, the next_event will contain an expiry time, but the
hrtimer will not be started which happens when the racing callback returns
HRTIMER_NORESTART. The hrtimer might never recover if all further requests
from the CPUs to subscribe to tick broadcast have a timeout greater than the
next_event of the tick broadcast clock device. This leads to cascading
failures, finally observed as RCU stall warnings.
Here is a depiction of the race condition
CPU #1 (Running timer callback)                 CPU #2 (Enter idle and
                                                subscribe to tick broadcast)
---------------------                           ---------------------
__run_hrtimer()                                 tick_broadcast_enter()
  bc_handler()                                    __tick_broadcast_oneshot_control()
    tick_handle_oneshot_broadcast()
      raw_spin_lock(&tick_broadcast_lock);
      dev->next_event = KTIME_MAX;              //wait for tick_broadcast_lock
      //next_event for tick broadcast clock
      //set to KTIME_MAX since no other cores
      //subscribed to tick broadcasting
      raw_spin_unlock(&tick_broadcast_lock);
    if (dev->next_event == KTIME_MAX)
      return HRTIMER_NORESTART
    // callback function exits without
    // restarting the hrtimer                   //tick_broadcast_lock acquired
                                                raw_spin_lock(&tick_broadcast_lock);
                                                tick_broadcast_set_event()
                                                  clockevents_program_event()
                                                    dev->next_event = expires;
                                                    bc_set_next()
                                                      hrtimer_try_to_cancel()
                                                      //returns -1 since the timer
                                                      //callback is active. Exits without
                                                      //restarting the timer
  cpu_base->running = NULL;
The comment that hrtimer cannot be armed from within the callback is
wrong. It is fine to start the hrtimer from within the callback. Also it is
safe to start the hrtimer from the enter/exit idle code while the broadcast
handler is active. The enter/exit idle code and the broadcast handler are
synchronized using tick_broadcast_lock. So there is no need for the
existing try to cancel logic. All this can be removed which will eliminate
the race condition as well.
Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast")
Originally-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan…
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index c1f5bb590b5e..b5a65e212df2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt)
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
- int bc_moved;
/*
- * We try to cancel the timer first. If the callback is on
- * flight on some other cpu then we let it handle it. If we
- * were able to cancel the timer nothing can rearm it as we
- * own broadcast_lock.
+ * This is called either from enter/exit idle code or from the
+ * broadcast handler. In all cases tick_broadcast_lock is held.
*
- * However we can also be called from the event handler of
- * ce_broadcast_hrtimer itself when it expires. We cannot
- * restart the timer because we are in the callback, but we
- * can set the expiry time and let the callback return
- * HRTIMER_RESTART.
+ * hrtimer_cancel() cannot be called here neither from the
+ * broadcast handler nor from the enter/exit idle code. The idle
+ * code can run into the problem described in bc_shutdown() and the
+ * broadcast handler cannot wait for itself to complete for obvious
+ * reasons.
*
- * Since we are in the idle loop at this point and because
- * hrtimer_{start/cancel} functions call into tracing,
- * calls to these functions must be bound within RCU_NONIDLE.
+ * Each caller tries to arm the hrtimer on its own CPU, but if the
+ * hrtimer callbback function is currently running, then
+ * hrtimer_start() cannot move it and the timer stays on the CPU on
+ * which it is assigned at the moment.
+ *
+ * As this can be called from idle code, the hrtimer_start()
+ * invocation has to be wrapped with RCU_NONIDLE() as
+ * hrtimer_start() can call into tracing.
*/
- RCU_NONIDLE(
- {
- bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
- if (bc_moved) {
- hrtimer_start(&bctimer, expires,
- HRTIMER_MODE_ABS_PINNED_HARD);
- }
- }
- );
-
- if (bc_moved) {
- /* Bind the "device" to the cpu */
- bc->bound_on = smp_processor_id();
- } else if (bc->bound_on == smp_processor_id()) {
- hrtimer_set_expires(&bctimer, expires);
- }
+ RCU_NONIDLE( {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
+ *
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
+ */
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+ } );
return 0;
}
@@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
- return HRTIMER_RESTART;
-
return HRTIMER_NORESTART;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001
From: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Date: Thu, 26 Sep 2019 15:51:01 +0200
Subject: [PATCH] tick: broadcast-hrtimer: Fix a race in bc_set_next
When a cpu requests broadcasting, before starting the tick broadcast
hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active
using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide
the required synchronization when the callback is active on another core.
The callback could have already executed tick_handle_oneshot_broadcast()
and could have also returned. But still there is a small time window where
the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns
without doing anything, but the next_event of the tick broadcast clock
device is already set to a timeout value.
In the race condition diagram below, CPU #1 is running the timer callback
and CPU #2 is entering idle state and so calls bc_set_next().
In the worst case, the next_event will contain an expiry time, but the
hrtimer will not be started which happens when the racing callback returns
HRTIMER_NORESTART. The hrtimer might never recover if all further requests
from the CPUs to subscribe to tick broadcast have a timeout greater than the
next_event of the tick broadcast clock device. This leads to cascading
failures, finally observed as RCU stall warnings.
Here is a depiction of the race condition
CPU #1 (Running timer callback)                 CPU #2 (Enter idle and
                                                subscribe to tick broadcast)
---------------------                           ---------------------
__run_hrtimer()                                 tick_broadcast_enter()
  bc_handler()                                    __tick_broadcast_oneshot_control()
    tick_handle_oneshot_broadcast()
      raw_spin_lock(&tick_broadcast_lock);
      dev->next_event = KTIME_MAX;              //wait for tick_broadcast_lock
      //next_event for tick broadcast clock
      //set to KTIME_MAX since no other cores
      //subscribed to tick broadcasting
      raw_spin_unlock(&tick_broadcast_lock);
    if (dev->next_event == KTIME_MAX)
      return HRTIMER_NORESTART
    // callback function exits without
    // restarting the hrtimer                   //tick_broadcast_lock acquired
                                                raw_spin_lock(&tick_broadcast_lock);
                                                tick_broadcast_set_event()
                                                  clockevents_program_event()
                                                    dev->next_event = expires;
                                                    bc_set_next()
                                                      hrtimer_try_to_cancel()
                                                      //returns -1 since the timer
                                                      //callback is active. Exits without
                                                      //restarting the timer
  cpu_base->running = NULL;
The comment that hrtimer cannot be armed from within the callback is
wrong. It is fine to start the hrtimer from within the callback. Also it is
safe to start the hrtimer from the enter/exit idle code while the broadcast
handler is active. The enter/exit idle code and the broadcast handler are
synchronized using tick_broadcast_lock. So there is no need for the
existing try to cancel logic. All this can be removed which will eliminate
the race condition as well.
Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast")
Originally-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Balasubramani Vivekanandan <balasubramani_vivekanandan(a)mentor.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan…
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index c1f5bb590b5e..b5a65e212df2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt)
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
- int bc_moved;
/*
- * We try to cancel the timer first. If the callback is on
- * flight on some other cpu then we let it handle it. If we
- * were able to cancel the timer nothing can rearm it as we
- * own broadcast_lock.
+ * This is called either from enter/exit idle code or from the
+ * broadcast handler. In all cases tick_broadcast_lock is held.
*
- * However we can also be called from the event handler of
- * ce_broadcast_hrtimer itself when it expires. We cannot
- * restart the timer because we are in the callback, but we
- * can set the expiry time and let the callback return
- * HRTIMER_RESTART.
+ * hrtimer_cancel() cannot be called here neither from the
+ * broadcast handler nor from the enter/exit idle code. The idle
+ * code can run into the problem described in bc_shutdown() and the
+ * broadcast handler cannot wait for itself to complete for obvious
+ * reasons.
*
- * Since we are in the idle loop at this point and because
- * hrtimer_{start/cancel} functions call into tracing,
- * calls to these functions must be bound within RCU_NONIDLE.
+ * Each caller tries to arm the hrtimer on its own CPU, but if the
+ * hrtimer callbback function is currently running, then
+ * hrtimer_start() cannot move it and the timer stays on the CPU on
+ * which it is assigned at the moment.
+ *
+ * As this can be called from idle code, the hrtimer_start()
+ * invocation has to be wrapped with RCU_NONIDLE() as
+ * hrtimer_start() can call into tracing.
*/
- RCU_NONIDLE(
- {
- bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
- if (bc_moved) {
- hrtimer_start(&bctimer, expires,
- HRTIMER_MODE_ABS_PINNED_HARD);
- }
- }
- );
-
- if (bc_moved) {
- /* Bind the "device" to the cpu */
- bc->bound_on = smp_processor_id();
- } else if (bc->bound_on == smp_processor_id()) {
- hrtimer_set_expires(&bctimer, expires);
- }
+ RCU_NONIDLE( {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
+ *
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
+ */
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+ } );
return 0;
}
@@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
- return HRTIMER_RESTART;
-
return HRTIMER_NORESTART;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0ba3c026e685573bd3534c17e27da7c505ac99c4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert(a)gondor.apana.org.au>
Date: Fri, 6 Sep 2019 13:13:06 +1000
Subject: [PATCH] crypto: skcipher - Unmap pages after an external error
skcipher_walk_done may be called with an error by internal or
external callers. For those internal callers we shouldn't unmap
pages but for external callers we must unmap any pages that are
in use.
This patch distinguishes between the two cases by checking whether
walk->nbytes is zero or not. For internal callers, we now set
walk->nbytes to zero prior to the call. For external callers,
walk->nbytes has always been non-zero (as zero is used to indicate
the termination of a walk).
Reported-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Fixes: 5cde0af2a982 ("[CRYPTO] cipher: Added block cipher type")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
Tested-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 5d836fc3df3e..22753c1c7202 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -90,7 +90,7 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len)
return max(start, end_page);
}
-static void skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
+static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
{
u8 *addr;
@@ -98,19 +98,21 @@ static void skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
addr = skcipher_get_spot(addr, bsize);
scatterwalk_copychunks(addr, &walk->out, bsize,
(walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1);
+ return 0;
}
int skcipher_walk_done(struct skcipher_walk *walk, int err)
{
- unsigned int n; /* bytes processed */
- bool more;
+ unsigned int n = walk->nbytes;
+ unsigned int nbytes = 0;
- if (unlikely(err < 0))
+ if (!n)
goto finish;
- n = walk->nbytes - err;
- walk->total -= n;
- more = (walk->total != 0);
+ if (likely(err >= 0)) {
+ n -= err;
+ nbytes = walk->total - n;
+ }
if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS |
SKCIPHER_WALK_SLOW |
@@ -126,7 +128,7 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
memcpy(walk->dst.virt.addr, walk->page, n);
skcipher_unmap_dst(walk);
} else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) {
- if (err) {
+ if (err > 0) {
/*
* Didn't process all bytes. Either the algorithm is
* broken, or this was the last step and it turned out
@@ -134,27 +136,29 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
* the algorithm requires it.
*/
err = -EINVAL;
- goto finish;
- }
- skcipher_done_slow(walk, n);
- goto already_advanced;
+ nbytes = 0;
+ } else
+ n = skcipher_done_slow(walk, n);
}
+ if (err > 0)
+ err = 0;
+
+ walk->total = nbytes;
+ walk->nbytes = 0;
+
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
-already_advanced:
- scatterwalk_done(&walk->in, 0, more);
- scatterwalk_done(&walk->out, 1, more);
+ scatterwalk_done(&walk->in, 0, nbytes);
+ scatterwalk_done(&walk->out, 1, nbytes);
- if (more) {
+ if (nbytes) {
crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ?
CRYPTO_TFM_REQ_MAY_SLEEP : 0);
return skcipher_walk_next(walk);
}
- err = 0;
-finish:
- walk->nbytes = 0;
+finish:
/* Short-circuit for the common/fast path. */
if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
goto out;
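A hedged illustration of the convention the fix relies on (the cipher routine and internal path shown here are hypothetical): external callers invoke skcipher_walk_done() while walk->nbytes is still non-zero, so pages are unmapped even on error, whereas internal error paths now zero walk->nbytes first so nothing is unmapped:
	/* External caller, e.g. a cipher implementation: pages still mapped */
	err = crypt_one_block(walk.dst.virt.addr, walk.src.virt.addr, nbytes);
	err = skcipher_walk_done(&walk, err);	/* walk.nbytes != 0 on entry */

	/* Hypothetical internal error path inside the walk machinery */
	walk->nbytes = 0;			/* mark: no pages mapped */
	return skcipher_walk_done(walk, -EINVAL);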
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0ba3c026e685573bd3534c17e27da7c505ac99c4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert(a)gondor.apana.org.au>
Date: Fri, 6 Sep 2019 13:13:06 +1000
Subject: [PATCH] crypto: skcipher - Unmap pages after an external error
skcipher_walk_done may be called with an error by internal or
external callers. For those internal callers we shouldn't unmap
pages but for external callers we must unmap any pages that are
in use.
This patch distinguishes between the two cases by checking whether
walk->nbytes is zero or not. For internal callers, we now set
walk->nbytes to zero prior to the call. For external callers,
walk->nbytes has always been non-zero (as zero is used to indicate
the termination of a walk).
Reported-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Fixes: 5cde0af2a982 ("[CRYPTO] cipher: Added block cipher type")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
Tested-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 5d836fc3df3e..22753c1c7202 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -90,7 +90,7 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len)
return max(start, end_page);
}
-static void skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
+static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
{
u8 *addr;
@@ -98,19 +98,21 @@ static void skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
addr = skcipher_get_spot(addr, bsize);
scatterwalk_copychunks(addr, &walk->out, bsize,
(walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1);
+ return 0;
}
int skcipher_walk_done(struct skcipher_walk *walk, int err)
{
- unsigned int n; /* bytes processed */
- bool more;
+ unsigned int n = walk->nbytes;
+ unsigned int nbytes = 0;
- if (unlikely(err < 0))
+ if (!n)
goto finish;
- n = walk->nbytes - err;
- walk->total -= n;
- more = (walk->total != 0);
+ if (likely(err >= 0)) {
+ n -= err;
+ nbytes = walk->total - n;
+ }
if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS |
SKCIPHER_WALK_SLOW |
@@ -126,7 +128,7 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
memcpy(walk->dst.virt.addr, walk->page, n);
skcipher_unmap_dst(walk);
} else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) {
- if (err) {
+ if (err > 0) {
/*
* Didn't process all bytes. Either the algorithm is
* broken, or this was the last step and it turned out
@@ -134,27 +136,29 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
* the algorithm requires it.
*/
err = -EINVAL;
- goto finish;
- }
- skcipher_done_slow(walk, n);
- goto already_advanced;
+ nbytes = 0;
+ } else
+ n = skcipher_done_slow(walk, n);
}
+ if (err > 0)
+ err = 0;
+
+ walk->total = nbytes;
+ walk->nbytes = 0;
+
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
-already_advanced:
- scatterwalk_done(&walk->in, 0, more);
- scatterwalk_done(&walk->out, 1, more);
+ scatterwalk_done(&walk->in, 0, nbytes);
+ scatterwalk_done(&walk->out, 1, nbytes);
- if (more) {
+ if (nbytes) {
crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ?
CRYPTO_TFM_REQ_MAY_SLEEP : 0);
return skcipher_walk_next(walk);
}
- err = 0;
-finish:
- walk->nbytes = 0;
+finish:
/* Short-circuit for the common/fast path. */
if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
goto out;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b0215e2d6a18d8331b2d4a8b38ccf3eff783edb1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Date: Wed, 28 Aug 2019 15:05:28 -0400
Subject: [PATCH] tools lib traceevent: Do not free tep->cmdlines in
add_new_comm() on failure
If the re-allocation of tep->cmdlines succeeds, then the previous
allocation of tep->cmdlines will be freed. If we later fail in
add_new_comm(), we must not free cmdlines, and also should assign
tep->cmdlines to the new allocation. Otherwise when freeing tep, the
tep->cmdlines will be pointing to garbage.
Fixes: a6d2a61ac653a ("tools lib traceevent: Remove some die() calls")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: linux-trace-devel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20190828191819.970121417@goodmis.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index b36b536a9fcb..13fd9fdf91e0 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -269,10 +269,10 @@ static int add_new_comm(struct tep_handle *tep,
errno = ENOMEM;
return -1;
}
+ tep->cmdlines = cmdlines;
cmdlines[tep->cmdline_count].comm = strdup(comm);
if (!cmdlines[tep->cmdline_count].comm) {
- free(cmdlines);
errno = ENOMEM;
return -1;
}
@@ -283,7 +283,6 @@ static int add_new_comm(struct tep_handle *tep,
tep->cmdline_count++;
qsort(cmdlines, tep->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
- tep->cmdlines = cmdlines;
return 0;
}
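The bug is the classic realloc() ownership hand-off. A generic, self-contained sketch of the rule the fix enforces (names are made up, this is not the traceevent code): once realloc() succeeds the old pointer is dead, so publish the new one immediately and never free it on a later, unrelated failure:
	#include <stdlib.h>
	#include <string.h>

	struct cmdline { char *comm; };
	struct table { struct cmdline *items; int count; };

	static int add_entry(struct table *t, const char *comm)
	{
		struct cmdline *items;

		items = realloc(t->items, (t->count + 1) * sizeof(*items));
		if (!items)
			return -1;		/* t->items is still valid */
		t->items = items;		/* publish the new buffer right away */
		items[t->count].comm = strdup(comm);
		if (!items[t->count].comm)
			return -1;		/* do NOT free 'items' here */
		t->count++;
		return 0;
	}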
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b0215e2d6a18d8331b2d4a8b38ccf3eff783edb1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Date: Wed, 28 Aug 2019 15:05:28 -0400
Subject: [PATCH] tools lib traceevent: Do not free tep->cmdlines in
add_new_comm() on failure
If the re-allocation of tep->cmdlines succeeds, then the previous
allocation of tep->cmdlines will be freed. If we later fail in
add_new_comm(), we must not free cmdlines, and also should assign
tep->cmdlines to the new allocation. Otherwise when freeing tep, the
tep->cmdlines will be pointing to garbage.
Fixes: a6d2a61ac653a ("tools lib traceevent: Remove some die() calls")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: linux-trace-devel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20190828191819.970121417@goodmis.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index b36b536a9fcb..13fd9fdf91e0 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -269,10 +269,10 @@ static int add_new_comm(struct tep_handle *tep,
errno = ENOMEM;
return -1;
}
+ tep->cmdlines = cmdlines;
cmdlines[tep->cmdline_count].comm = strdup(comm);
if (!cmdlines[tep->cmdline_count].comm) {
- free(cmdlines);
errno = ENOMEM;
return -1;
}
@@ -283,7 +283,6 @@ static int add_new_comm(struct tep_handle *tep,
tep->cmdline_count++;
qsort(cmdlines, tep->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
- tep->cmdlines = cmdlines;
return 0;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b0215e2d6a18d8331b2d4a8b38ccf3eff783edb1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Date: Wed, 28 Aug 2019 15:05:28 -0400
Subject: [PATCH] tools lib traceevent: Do not free tep->cmdlines in
add_new_comm() on failure
If the re-allocation of tep->cmdlines succeeds, then the previous
allocation of tep->cmdlines will be freed. If we later fail in
add_new_comm(), we must not free cmdlines, and also should assign
tep->cmdlines to the new allocation. Otherwise when freeing tep, the
tep->cmdlines will be pointing to garbage.
Fixes: a6d2a61ac653a ("tools lib traceevent: Remove some die() calls")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: linux-trace-devel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20190828191819.970121417@goodmis.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index b36b536a9fcb..13fd9fdf91e0 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -269,10 +269,10 @@ static int add_new_comm(struct tep_handle *tep,
errno = ENOMEM;
return -1;
}
+ tep->cmdlines = cmdlines;
cmdlines[tep->cmdline_count].comm = strdup(comm);
if (!cmdlines[tep->cmdline_count].comm) {
- free(cmdlines);
errno = ENOMEM;
return -1;
}
@@ -283,7 +283,6 @@ static int add_new_comm(struct tep_handle *tep,
tep->cmdline_count++;
qsort(cmdlines, tep->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
- tep->cmdlines = cmdlines;
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c784be435d5dae28d3b03db31753dd7a18733f0c Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com>
Date: Wed, 15 May 2019 13:15:52 +0530
Subject: [PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in
resize_hpt()
The calls to arch_add_memory()/arch_remove_memory() are always made
with the read-side cpu_hotplug_lock acquired via memory_hotplug_begin().
On pSeries, arch_add_memory()/arch_remove_memory() eventually call
resize_hpt() which in turn calls stop_machine() which acquires the
read-side cpu_hotplug_lock again, thereby resulting in the recursive
acquisition of this lock.
In the absence of CONFIG_PROVE_LOCKING, we hadn't observed a system
lockup during a memory hotplug operation because cpus_read_lock() is a
per-cpu rwsem read, which, in the fast-path (in the absence of the
writer, which in our case is a CPU-hotplug operation) simply
increments the read_count on the semaphore. Thus a recursive read in
the fast-path doesn't cause any problems.
However, we can hit this problem in practice if there is a concurrent
CPU-Hotplug operation in progress which is waiting to acquire the
write-side of the lock. This will cause the second recursive read to
block until the writer finishes. Meanwhile, the writer stays blocked since the
first read holds the lock. Thus both the reader and the writer fail to make
any progress, blocking both CPU-Hotplug and Memory-Hotplug operations.
Memory-Hotplug                           CPU-Hotplug
CPU 0                                    CPU 1
------                                   ------
1. down_read(cpu_hotplug_lock.rw_sem)
   [memory_hotplug_begin]
                                         2. down_write(cpu_hotplug_lock.rw_sem)
                                            [cpu_up/cpu_down]
3. down_read(cpu_hotplug_lock.rw_sem)
   [stop_machine()]
Lockdep complains as follows in these code-paths.
swapper/0/1 is trying to acquire lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
but task is already holding lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(cpu_hotplug_lock.rw_sem);
lock(cpu_hotplug_lock.rw_sem);
*** DEADLOCK ***
May be due to missing lock nesting notation
3 locks held by swapper/0/1:
#0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0
#1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
#2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166
Call Trace:
dump_stack+0xe8/0x164 (unreliable)
__lock_acquire+0x1110/0x1c70
lock_acquire+0x240/0x290
cpus_read_lock+0x64/0xf0
stop_machine+0x2c/0x60
pseries_lpar_resize_hpt+0x19c/0x2c0
resize_hpt_for_hotplug+0x70/0xd0
arch_add_memory+0x58/0xfc
devm_memremap_pages+0x5e8/0x8f0
pmem_attach_disk+0x764/0x830
nvdimm_bus_probe+0x118/0x240
really_probe+0x230/0x4b0
driver_probe_device+0x16c/0x1e0
__driver_attach+0x148/0x1b0
bus_for_each_dev+0x90/0x130
driver_attach+0x34/0x50
bus_add_driver+0x1a8/0x360
driver_register+0x108/0x170
__nd_driver_register+0xd0/0xf0
nd_pmem_driver_init+0x34/0x48
do_one_initcall+0x1e0/0x45c
kernel_init_freeable+0x540/0x64c
kernel_init+0x2c/0x160
ret_from_kernel_thread+0x5c/0x68
Fix this issue by
1) Requiring all the calls to pseries_lpar_resize_hpt() be made
with cpu_hotplug_lock held.
2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked()
as a consequence of 1)
3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
with cpu_hotplug_lock held.
Fixes: dbcf929c0062 ("powerpc/pseries: Add support for hash table resizing")
Cc: stable(a)vger.kernel.org # v4.11+
Reported-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/1557906352-29048-1-git-send-email-ego@linux.vnet.…
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e6d471058597..c363e850550e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -34,6 +34,7 @@
#include <linux/libfdt.h>
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
+#include <linux/cpu.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
@@ -1931,10 +1932,16 @@ static int hpt_order_get(void *data, u64 *val)
static int hpt_order_set(void *data, u64 val)
{
+ int ret;
+
if (!mmu_hash_ops.resize_hpt)
return -ENODEV;
- return mmu_hash_ops.resize_hpt(val);
+ cpus_read_lock();
+ ret = mmu_hash_ops.resize_hpt(val);
+ cpus_read_unlock();
+
+ return ret;
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..4f76e5f30c97 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
return 0;
}
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
struct hpt_resize_state state = {
@@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
t1 = ktime_get();
- rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
t2 = ktime_get();
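For context, the recursive acquisition comes from stop_machine() itself, which takes the read-side lock around its _cpuslocked variant; roughly (simplified sketch of kernel/stop_machine.c, not the stable backport):
	int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
	{
		int ret;

		/* This nested read-lock is the recursion the patch avoids */
		cpus_read_lock();
		ret = stop_machine_cpuslocked(fn, data, cpus);
		cpus_read_unlock();
		return ret;
	}
Hence the fix requires callers of pseries_lpar_resize_hpt() to hold cpu_hotplug_lock and switches the resize path to stop_machine_cpuslocked(), as the hunks above show.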
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c784be435d5dae28d3b03db31753dd7a18733f0c Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com>
Date: Wed, 15 May 2019 13:15:52 +0530
Subject: [PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in
resize_hpt()
The calls to arch_add_memory()/arch_remove_memory() are always made
with the read-side cpu_hotplug_lock acquired via memory_hotplug_begin().
On pSeries, arch_add_memory()/arch_remove_memory() eventually call
resize_hpt() which in turn calls stop_machine() which acquires the
read-side cpu_hotplug_lock again, thereby resulting in the recursive
acquisition of this lock.
In the absence of CONFIG_PROVE_LOCKING, we hadn't observed a system
lockup during a memory hotplug operation because cpus_read_lock() is a
per-cpu rwsem read, which, in the fast-path (in the absence of the
writer, which in our case is a CPU-hotplug operation) simply
increments the read_count on the semaphore. Thus a recursive read in
the fast-path doesn't cause any problems.
However, we can hit this problem in practice if there is a concurrent
CPU-Hotplug operation in progress which is waiting to acquire the
write-side of the lock. This will cause the second recursive read to
block until the writer finishes. Meanwhile, the writer stays blocked since the
first read holds the lock. Thus both the reader and the writer fail to make
any progress, blocking both CPU-Hotplug and Memory-Hotplug operations.
Memory-Hotplug                           CPU-Hotplug
CPU 0                                    CPU 1
------                                   ------
1. down_read(cpu_hotplug_lock.rw_sem)
   [memory_hotplug_begin]
                                         2. down_write(cpu_hotplug_lock.rw_sem)
                                            [cpu_up/cpu_down]
3. down_read(cpu_hotplug_lock.rw_sem)
   [stop_machine()]
Lockdep complains as follows in these code-paths.
swapper/0/1 is trying to acquire lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
but task is already holding lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(cpu_hotplug_lock.rw_sem);
lock(cpu_hotplug_lock.rw_sem);
*** DEADLOCK ***
May be due to missing lock nesting notation
3 locks held by swapper/0/1:
#0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0
#1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
#2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166
Call Trace:
dump_stack+0xe8/0x164 (unreliable)
__lock_acquire+0x1110/0x1c70
lock_acquire+0x240/0x290
cpus_read_lock+0x64/0xf0
stop_machine+0x2c/0x60
pseries_lpar_resize_hpt+0x19c/0x2c0
resize_hpt_for_hotplug+0x70/0xd0
arch_add_memory+0x58/0xfc
devm_memremap_pages+0x5e8/0x8f0
pmem_attach_disk+0x764/0x830
nvdimm_bus_probe+0x118/0x240
really_probe+0x230/0x4b0
driver_probe_device+0x16c/0x1e0
__driver_attach+0x148/0x1b0
bus_for_each_dev+0x90/0x130
driver_attach+0x34/0x50
bus_add_driver+0x1a8/0x360
driver_register+0x108/0x170
__nd_driver_register+0xd0/0xf0
nd_pmem_driver_init+0x34/0x48
do_one_initcall+0x1e0/0x45c
kernel_init_freeable+0x540/0x64c
kernel_init+0x2c/0x160
ret_from_kernel_thread+0x5c/0x68
Fix this issue by
1) Requiring all the calls to pseries_lpar_resize_hpt() be made
with cpu_hotplug_lock held.
2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked()
as a consequence of 1)
3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
with cpu_hotplug_lock held.
Fixes: dbcf929c0062 ("powerpc/pseries: Add support for hash table resizing")
Cc: stable(a)vger.kernel.org # v4.11+
Reported-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/1557906352-29048-1-git-send-email-ego@linux.vnet.…
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e6d471058597..c363e850550e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -34,6 +34,7 @@
#include <linux/libfdt.h>
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
+#include <linux/cpu.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
@@ -1931,10 +1932,16 @@ static int hpt_order_get(void *data, u64 *val)
static int hpt_order_set(void *data, u64 val)
{
+ int ret;
+
if (!mmu_hash_ops.resize_hpt)
return -ENODEV;
- return mmu_hash_ops.resize_hpt(val);
+ cpus_read_lock();
+ ret = mmu_hash_ops.resize_hpt(val);
+ cpus_read_unlock();
+
+ return ret;
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..4f76e5f30c97 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
return 0;
}
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
struct hpt_resize_state state = {
@@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
t1 = ktime_get();
- rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
t2 = ktime_get();
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da15c03b047dca891d37b9f4ef9ca14d84a6484f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus(a)ozlabs.org>
Date: Tue, 13 Aug 2019 20:06:48 +1000
Subject: [PATCH] powerpc/xive: Implement get_irqchip_state method for XIVE to
fix shutdown race
Testing has revealed the existence of a race condition where a XIVE
interrupt being shut down can be in one of the XIVE interrupt queues
(of which there are up to 8 per CPU, one for each priority) at the
point where free_irq() is called. If this happens, the queue scan can return
an interrupt number which has already been shut down. This can lead to various
symptoms:
- irq_to_desc(irq) can be NULL. In this case, no end-of-interrupt
function gets called, resulting in the CPU's elevated interrupt
priority (numerically lowered CPPR) never gets reset. That then
means that the CPU stops processing interrupts, causing device
timeouts and other errors in various device drivers.
- The irq descriptor or related data structures can be in the process
of being freed as the interrupt code is using them. This typically
leads to crashes due to bad pointer dereferences.
This race is basically what commit 62e0468650c3 ("genirq: Add optional
hardware synchronization for shutdown", 2019-06-28) is intended to
fix, given a get_irqchip_state() method for the interrupt controller
being used. It works by polling the interrupt controller when an
interrupt is being freed until the controller says it is not pending.
With XIVE, the PQ bits of the interrupt source indicate the state of
the interrupt source, and in particular the P bit goes from 0 to 1 at
the point where the hardware writes an entry into the interrupt queue
that this interrupt is directed towards. Normally, the code will then
process the interrupt and do an end-of-interrupt (EOI) operation which
will reset PQ to 00 (assuming another interrupt hasn't been generated
in the meantime). However, there are situations where the code resets
P even though a queue entry exists (for example, by setting PQ to 01,
which disables the interrupt source), and also situations where the
code leaves P at 1 after removing the queue entry (for example, this
is done for escalation interrupts so they cannot fire again until
they are explicitly re-enabled).
The code already has a 'saved_p' flag for the interrupt source which
indicates that a queue entry exists, although it isn't maintained
consistently. This patch adds a 'stale_p' flag to indicate that
P has been left at 1 after processing a queue entry, and adds code
to set and clear saved_p and stale_p as necessary to maintain a
consistent indication of whether a queue entry may or may not exist.
With this, we can implement xive_get_irqchip_state() by looking at
stale_p, saved_p and the ESB PQ bits for the interrupt.
There is some additional code to handle escalation interrupts
properly, because they are enabled and disabled in KVM assembly code,
which does not have access to the xive_irq_data struct for the
escalation interrupt. Hence, stale_p may be incorrect when the
escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu().
Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on,
with some careful attention to barriers in order to ensure the correct
result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu().
Finally, this adds code to make noise on the console (pr_crit and
WARN_ON(1)) if we find an interrupt queue entry for an interrupt
which does not have a descriptor. While this won't catch the race
reliably, if it does get triggered it will be an indication that
the race is occurring and needs to be debugged.
Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Paul Mackerras <paulus(a)ozlabs.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index e4016985764e..efb0e597b272 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -46,7 +46,15 @@ struct xive_irq_data {
/* Setup/used by frontend */
int target;
+ /*
+ * saved_p means that there is a queue entry for this interrupt
+ * in some CPU's queue (not including guest vcpu queues), even
+ * if P is not set in the source ESB.
+ * stale_p means that there is no queue entry for this interrupt
+ * in some CPU's queue, even if P is set in the source ESB.
+ */
bool saved_p;
+ bool stale_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 586867e46e51..591bfb4bfd0f 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
@@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.xive_esc_raddr = 0;
}
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smb_wmb in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
+}
+
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 50494d0ee375..955b820ffd6d 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
bool single_escalation);
struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq);
#endif /* CONFIG_KVM_XICS */
#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 11b91b46fc39..f0cab43e6f4b 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -71,6 +71,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
/* Free the escalation irq */
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 1cdb39575eae..be86fce1a84e 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek)
static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
{
u32 irq = 0;
- u8 prio;
+ u8 prio = 0;
/* Find highest pending priority */
while (xc->pending_prio != 0) {
@@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
irq = xive_read_eq(&xc->queue[prio], just_peek);
/* Found something ? That's it */
- if (irq)
- break;
+ if (irq) {
+ if (just_peek || irq_to_desc(irq))
+ break;
+ /*
+ * We should never get here; if we do then we must
+ * have failed to synchronize the interrupt properly
+ * when shutting it down.
+ */
+ pr_crit("xive: got interrupt %d without descriptor, dropping\n",
+ irq);
+ WARN_ON(1);
+ continue;
+ }
/* Clear pending bits */
xc->pending_prio &= ~(1 << prio);
@@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc)
*/
static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
{
+ xd->stale_p = false;
/* If the XIVE supports the new "store EOI facility, use it */
if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0);
@@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
}
}
-/* irq_chip eoi callback */
+/* irq_chip eoi callback, called with irq descriptor lock held */
static void xive_irq_eoi(struct irq_data *d)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d)
if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) &&
!(xd->flags & XIVE_IRQ_NO_EOI))
xive_do_source_eoi(irqd_to_hwirq(d), xd);
+ else
+ xd->stale_p = true;
/*
* Clear saved_p to indicate that it's no longer occupying
@@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd,
*/
if (mask) {
val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
- xd->saved_p = !!(val & XIVE_ESB_VAL_P);
- } else if (xd->saved_p)
+ if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P))
+ xd->saved_p = true;
+ xd->stale_p = false;
+ } else if (xd->saved_p) {
xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
- else
+ xd->saved_p = false;
+ } else {
xive_esb_read(xd, XIVE_ESB_SET_PQ_00);
+ xd->stale_p = false;
+ }
}
/*
@@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d)
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
int target, rc;
+ xd->saved_p = false;
+ xd->stale_p = false;
pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
d->irq, hw_irq, d);
@@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
return 0;
}
+/* called with irq descriptor lock held */
static void xive_irq_shutdown(struct irq_data *d)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -601,16 +623,6 @@ static void xive_irq_shutdown(struct irq_data *d)
/* Mask the interrupt at the source */
xive_do_source_set_mask(xd, true);
- /*
- * The above may have set saved_p. We clear it otherwise it
- * will prevent re-enabling later on. It is ok to forget the
- * fact that the interrupt might be in a queue because we are
- * accounting that already in xive_dec_target_count() and will
- * be re-routing it to a new queue with proper accounting when
- * it's started up again
- */
- xd->saved_p = false;
-
/*
* Mask the interrupt in HW in the IVT/EAS and set the number
* to be the "bad" IRQ number
@@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d)
return 1;
}
+/*
+ * Caller holds the irq descriptor lock, so this won't be called
+ * concurrently with xive_get_irqchip_state on the same interrupt.
+ */
static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
/* Set it to PQ=10 state to prevent further sends */
pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
+ if (!xd->stale_p) {
+ xd->saved_p = !!(pq & XIVE_ESB_VAL_P);
+ xd->stale_p = !xd->saved_p;
+ }
/* No target ? nothing to do */
if (xd->target == XIVE_INVALID_TARGET) {
@@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
* An untargetted interrupt should have been
* also masked at the source
*/
- WARN_ON(pq & 2);
+ WARN_ON(xd->saved_p);
return 0;
}
@@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
* This saved_p is cleared by the host EOI, when we know
* for sure the queue slot is no longer in use.
*/
- if (pq & 2) {
- pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
- xd->saved_p = true;
+ if (xd->saved_p) {
+ xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
/*
* Sync the XIVE source HW to ensure the interrupt
@@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
*/
if (xive_ops->sync_source)
xive_ops->sync_source(hw_irq);
- } else
- xd->saved_p = false;
+ }
} else {
irqd_clr_forwarded_to_vcpu(d);
@@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
return 0;
}
+/* Called with irq descriptor lock held. */
+static int xive_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool *state)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_ACTIVE:
+ *state = !xd->stale_p &&
+ (xd->saved_p ||
+ !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static struct irq_chip xive_irq_chip = {
.name = "XIVE-IRQ",
.irq_startup = xive_irq_startup,
@@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = {
.irq_set_type = xive_irq_set_type,
.irq_retrigger = xive_irq_retrigger,
.irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
+ .irq_get_irqchip_state = xive_get_irqchip_state,
};
bool is_xive_irq(struct irq_chip *chip)
@@ -1337,6 +1373,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
raw_spin_lock(&desc->lock);
xd = irq_desc_get_handler_data(desc);
+ /*
+ * Clear saved_p to indicate that it's no longer pending
+ */
+ xd->saved_p = false;
+
/*
* For LSIs, we EOI, this will cause a resend if it's
* still asserted. Otherwise do an MSI retrigger.
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da15c03b047dca891d37b9f4ef9ca14d84a6484f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus(a)ozlabs.org>
Date: Tue, 13 Aug 2019 20:06:48 +1000
Subject: [PATCH] powerpc/xive: Implement get_irqchip_state method for XIVE to
fix shutdown race
Testing has revealed the existence of a race condition where a XIVE
interrupt being shut down can be in one of the XIVE interrupt queues
(of which there are up to 8 per CPU, one for each priority) at the
point where free_irq() is called. If this happens, the queue scan can
return the number of an interrupt which has already been shut down. This
can lead to various
symptoms:
- irq_to_desc(irq) can be NULL. In this case, no end-of-interrupt
function gets called, resulting in the CPU's elevated interrupt
priority (numerically lowered CPPR) never being reset. That then
means that the CPU stops processing interrupts, causing device
timeouts and other errors in various device drivers.
- The irq descriptor or related data structures can be in the process
of being freed as the interrupt code is using them. This typically
leads to crashes due to bad pointer dereferences.
This race is basically what commit 62e0468650c3 ("genirq: Add optional
hardware synchronization for shutdown", 2019-06-28) is intended to
fix, given a get_irqchip_state() method for the interrupt controller
being used. It works by polling the interrupt controller when an
interrupt is being freed until the controller says it is not pending.
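As background, the synchronization added by that commit boils down to a
polling loop along the lines of the sketch below. This is an illustration
only: the real loop lives in kernel/irq/manage.c and works on irq internals
directly, the function name wait_until_irq_drained() is invented for the
example, and the public irq_get_irqchip_state() helper stands in for the
internal call. What matters is that the query ends up in the irq_chip's
.irq_get_irqchip_state() method, which is the hook this patch implements
for XIVE.

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/processor.h>

static void wait_until_irq_drained(unsigned int irq)
{
	bool active = true;

	/*
	 * Poll until the controller reports the interrupt as no longer
	 * active; for XIVE that means no queue entry is outstanding.
	 */
	while (active) {
		if (irq_get_irqchip_state(irq, IRQCHIP_STATE_ACTIVE, &active))
			break;	/* chip cannot report the state */
		cpu_relax();
	}
}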
With XIVE, the PQ bits of the interrupt source indicate the state of
the interrupt source, and in particular the P bit goes from 0 to 1 at
the point where the hardware writes an entry into the interrupt queue
that this interrupt is directed towards. Normally, the code will then
process the interrupt and do an end-of-interrupt (EOI) operation which
will reset PQ to 00 (assuming another interrupt hasn't been generated
in the meantime). However, there are situations where the code resets
P even though a queue entry exists (for example, by setting PQ to 01,
which disables the interrupt source), and also situations where the
code leaves P at 1 after removing the queue entry (for example, this
is done for escalation interrupts so they cannot fire again until
they are explicitly re-enabled).
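To make the hunks below easier to follow, here is a small illustrative
decoder for the two ESB bits, using the XIVE_ESB_VAL_P/XIVE_ESB_VAL_Q
definitions from asm/xive-regs.h. 'esb' would be the value returned by a
non-destructive XIVE_ESB_GET read such as the one the new
xive_get_irqchip_state() performs; the state descriptions simply restate
the paragraph above, and the function itself is not part of the driver.

#include <linux/types.h>
#include <asm/xive-regs.h>

static const char *xive_pq_state(u64 esb)
{
	bool p = esb & XIVE_ESB_VAL_P;	/* a queue entry has been written */
	bool q = esb & XIVE_ESB_VAL_Q;	/* a further event was coalesced  */

	if (!p && !q)
		return "PQ=00: idle, the next event generates a queue entry";
	if (!p && q)
		return "PQ=01: source disabled, no new queue entries";
	if (p && !q)
		return "PQ=10: entry queued, P stays set until EOI";
	return "PQ=11: entry queued plus a further event recorded in Q";
}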
The code already has a 'saved_p' flag for the interrupt source which
indicates that a queue entry exists, although it isn't maintained
consistently. This patch adds a 'stale_p' flag to indicate that
P has been left at 1 after processing a queue entry, and adds code
to set and clear saved_p and stale_p as necessary to maintain a
consistent indication of whether a queue entry may or may not exist.
With this, we can implement xive_get_irqchip_state() by looking at
stale_p, saved_p and the ESB PQ bits for the interrupt.
There is some additional code to handle escalation interrupts
properly, because they are enabled and disabled in KVM assembly code,
which does not have access to the xive_irq_data struct for the
escalation interrupt. Hence, stale_p may be incorrect when the
escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu().
Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on,
with some careful attention to barriers in order to ensure the correct
result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu().
Finally, this adds code to make noise on the console (pr_crit and
WARN_ON(1)) if we find an interrupt queue entry for an interrupt
which does not have a descriptor. While this won't catch the race
reliably, if it does get triggered it will be an indication that
the race is occurring and needs to be debugged.
Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Paul Mackerras <paulus(a)ozlabs.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index e4016985764e..efb0e597b272 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -46,7 +46,15 @@ struct xive_irq_data {
/* Setup/used by frontend */
int target;
+ /*
+ * saved_p means that there is a queue entry for this interrupt
+ * in some CPU's queue (not including guest vcpu queues), even
+ * if P is not set in the source ESB.
+ * stale_p means that there is no queue entry for this interrupt
+ * in some CPU's queue, even if P is set in the source ESB.
+ */
bool saved_p;
+ bool stale_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 586867e46e51..591bfb4bfd0f 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
@@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.xive_esc_raddr = 0;
}
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smb_wmb in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
+}
+
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 50494d0ee375..955b820ffd6d 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
bool single_escalation);
struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq);
#endif /* CONFIG_KVM_XICS */
#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 11b91b46fc39..f0cab43e6f4b 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -71,6 +71,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
/* Free the escalation irq */
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 1cdb39575eae..be86fce1a84e 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek)
static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
{
u32 irq = 0;
- u8 prio;
+ u8 prio = 0;
/* Find highest pending priority */
while (xc->pending_prio != 0) {
@@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
irq = xive_read_eq(&xc->queue[prio], just_peek);
/* Found something ? That's it */
- if (irq)
- break;
+ if (irq) {
+ if (just_peek || irq_to_desc(irq))
+ break;
+ /*
+ * We should never get here; if we do then we must
+ * have failed to synchronize the interrupt properly
+ * when shutting it down.
+ */
+ pr_crit("xive: got interrupt %d without descriptor, dropping\n",
+ irq);
+ WARN_ON(1);
+ continue;
+ }
/* Clear pending bits */
xc->pending_prio &= ~(1 << prio);
@@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc)
*/
static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
{
+ xd->stale_p = false;
/* If the XIVE supports the new "store EOI facility, use it */
if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0);
@@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
}
}
-/* irq_chip eoi callback */
+/* irq_chip eoi callback, called with irq descriptor lock held */
static void xive_irq_eoi(struct irq_data *d)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d)
if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) &&
!(xd->flags & XIVE_IRQ_NO_EOI))
xive_do_source_eoi(irqd_to_hwirq(d), xd);
+ else
+ xd->stale_p = true;
/*
* Clear saved_p to indicate that it's no longer occupying
@@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd,
*/
if (mask) {
val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
- xd->saved_p = !!(val & XIVE_ESB_VAL_P);
- } else if (xd->saved_p)
+ if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P))
+ xd->saved_p = true;
+ xd->stale_p = false;
+ } else if (xd->saved_p) {
xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
- else
+ xd->saved_p = false;
+ } else {
xive_esb_read(xd, XIVE_ESB_SET_PQ_00);
+ xd->stale_p = false;
+ }
}
/*
@@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d)
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
int target, rc;
+ xd->saved_p = false;
+ xd->stale_p = false;
pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
d->irq, hw_irq, d);
@@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
return 0;
}
+/* called with irq descriptor lock held */
static void xive_irq_shutdown(struct irq_data *d)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -601,16 +623,6 @@ static void xive_irq_shutdown(struct irq_data *d)
/* Mask the interrupt at the source */
xive_do_source_set_mask(xd, true);
- /*
- * The above may have set saved_p. We clear it otherwise it
- * will prevent re-enabling later on. It is ok to forget the
- * fact that the interrupt might be in a queue because we are
- * accounting that already in xive_dec_target_count() and will
- * be re-routing it to a new queue with proper accounting when
- * it's started up again
- */
- xd->saved_p = false;
-
/*
* Mask the interrupt in HW in the IVT/EAS and set the number
* to be the "bad" IRQ number
@@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d)
return 1;
}
+/*
+ * Caller holds the irq descriptor lock, so this won't be called
+ * concurrently with xive_get_irqchip_state on the same interrupt.
+ */
static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
{
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
/* Set it to PQ=10 state to prevent further sends */
pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
+ if (!xd->stale_p) {
+ xd->saved_p = !!(pq & XIVE_ESB_VAL_P);
+ xd->stale_p = !xd->saved_p;
+ }
/* No target ? nothing to do */
if (xd->target == XIVE_INVALID_TARGET) {
@@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
* An untargetted interrupt should have been
* also masked at the source
*/
- WARN_ON(pq & 2);
+ WARN_ON(xd->saved_p);
return 0;
}
@@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
* This saved_p is cleared by the host EOI, when we know
* for sure the queue slot is no longer in use.
*/
- if (pq & 2) {
- pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
- xd->saved_p = true;
+ if (xd->saved_p) {
+ xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
/*
* Sync the XIVE source HW to ensure the interrupt
@@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
*/
if (xive_ops->sync_source)
xive_ops->sync_source(hw_irq);
- } else
- xd->saved_p = false;
+ }
} else {
irqd_clr_forwarded_to_vcpu(d);
@@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
return 0;
}
+/* Called with irq descriptor lock held. */
+static int xive_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool *state)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_ACTIVE:
+ *state = !xd->stale_p &&
+ (xd->saved_p ||
+ !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static struct irq_chip xive_irq_chip = {
.name = "XIVE-IRQ",
.irq_startup = xive_irq_startup,
@@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = {
.irq_set_type = xive_irq_set_type,
.irq_retrigger = xive_irq_retrigger,
.irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
+ .irq_get_irqchip_state = xive_get_irqchip_state,
};
bool is_xive_irq(struct irq_chip *chip)
@@ -1337,6 +1373,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
raw_spin_lock(&desc->lock);
xd = irq_desc_get_handler_data(desc);
+ /*
+ * Clear saved_p to indicate that it's no longer pending
+ */
+ xd->saved_p = false;
+
/*
* For LSIs, we EOI, this will cause a resend if it's
* still asserted. Otherwise do an MSI retrigger.
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b1f373a11d25fc9a5f7679c9b85799fe09b0dc4a Mon Sep 17 00:00:00 2001
From: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Date: Fri, 19 Jul 2019 10:05:31 +0000
Subject: [PATCH] ASoC: sgtl5000: Improve VAG power and mute control
VAG power control is improved to fit the manual [1]. This patch fixes at
least one bug: if the customer muxes Headphone to Line-In right after boot,
the VAG power remains off, which leads to poor sound quality from line-in.
I.e. after boot:
- Connect sound source to Line-In jack;
- Connect headphone to HP jack;
- Run following commands:
$ amixer set 'Headphone' 80%
$ amixer set 'Headphone Mux' LINE_IN
Change VAG power on/off control according to the following algorithm:
- turn VAG power ON on the 1st incoming event.
- keep it ON if there is any active VAG consumer (ADC/DAC/HP/Line-In).
- turn VAG power OFF when the last consumer's pre-down event comes.
- always delay after VAG power OFF to avoid pop.
- delay after VAG power ON if the initiating consumer is Line-In, as this
prevents pop during line-in muxing.
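The power-off decision described in the list above can be modelled by the
small stand-alone sketch below. It mirrors the vag_power_consumers() logic
in the diff further down, but every name here is illustrative only and not
part of the driver.

#include <stdbool.h>

enum vag_event_source { HP_EVENT, DAC_EVENT, ADC_EVENT };

/* true when VAG must stay on because another consumer remains active */
static bool vag_must_stay_on(bool dac_on, bool adc_on, bool hp_on,
			     bool hp_muxed_to_line_in,
			     enum vag_event_source source)
{
	int consumers = 0;

	consumers += dac_on;
	consumers += adc_on;

	/*
	 * For HP events the headphone PGA may be powered down while it is
	 * muxed to Line-In, so Line-In counts as the extra consumer; for
	 * DAC/ADC events the headphone path itself is the extra consumer.
	 */
	if (source == HP_EVENT)
		consumers += hp_muxed_to_line_in;
	else
		consumers += hp_on;

	/* the current consumer is going away, so >= 2 means one remains */
	return consumers >= 2;
}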
According to the data sheet [1], to avoid any pops/clicks,
the outputs should be muted during input/output
routing changes.
[1] https://www.nxp.com/docs/en/data-sheet/SGTL5000.pdf
Cc: stable(a)vger.kernel.org
Fixes: 9b34e6cc3bc2 ("ASoC: Add Freescale SGTL5000 codec support")
Signed-off-by: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Reviewed-by: Marcel Ziswiler <marcel.ziswiler(a)toradex.com>
Reviewed-by: Fabio Estevam <festevam(a)gmail.com>
Reviewed-by: Cezary Rojewski <cezary.rojewski(a)intel.com>
Link: https://lore.kernel.org/r/20190719100524.23300-3-oleksandr.suvorov@toradex.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c
index a6a4748c97f9..34cc85e49003 100644
--- a/sound/soc/codecs/sgtl5000.c
+++ b/sound/soc/codecs/sgtl5000.c
@@ -31,6 +31,13 @@
#define SGTL5000_DAP_REG_OFFSET 0x0100
#define SGTL5000_MAX_REG_OFFSET 0x013A
+/* Delay for the VAG ramp up */
+#define SGTL5000_VAG_POWERUP_DELAY 500 /* ms */
+/* Delay for the VAG ramp down */
+#define SGTL5000_VAG_POWERDOWN_DELAY 500 /* ms */
+
+#define SGTL5000_OUTPUTS_MUTE (SGTL5000_HP_MUTE | SGTL5000_LINE_OUT_MUTE)
+
/* default value of sgtl5000 registers */
static const struct reg_default sgtl5000_reg_defaults[] = {
{ SGTL5000_CHIP_DIG_POWER, 0x0000 },
@@ -123,6 +130,13 @@ enum {
I2S_SCLK_STRENGTH_HIGH,
};
+enum {
+ HP_POWER_EVENT,
+ DAC_POWER_EVENT,
+ ADC_POWER_EVENT,
+ LAST_POWER_EVENT = ADC_POWER_EVENT
+};
+
/* sgtl5000 private structure in codec */
struct sgtl5000_priv {
int sysclk; /* sysclk rate */
@@ -137,8 +151,109 @@ struct sgtl5000_priv {
u8 micbias_voltage;
u8 lrclk_strength;
u8 sclk_strength;
+ u16 mute_state[LAST_POWER_EVENT + 1];
};
+static inline int hp_sel_input(struct snd_soc_component *component)
+{
+ return (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_CTRL) &
+ SGTL5000_HP_SEL_MASK) >> SGTL5000_HP_SEL_SHIFT;
+}
+
+static inline u16 mute_output(struct snd_soc_component *component,
+ u16 mute_mask)
+{
+ u16 mute_reg = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_CTRL);
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_mask);
+ return mute_reg;
+}
+
+static inline void restore_output(struct snd_soc_component *component,
+ u16 mute_mask, u16 mute_reg)
+{
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_reg);
+}
+
+static void vag_power_on(struct snd_soc_component *component, u32 source)
+{
+ if (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
+ SGTL5000_VAG_POWERUP)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
+
+ /* When VAG powering on to get local loop from Line-In, the sleep
+ * is required to avoid loud pop.
+ */
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN &&
+ source == HP_POWER_EVENT)
+ msleep(SGTL5000_VAG_POWERUP_DELAY);
+}
+
+static int vag_power_consumers(struct snd_soc_component *component,
+ u16 ana_pwr_reg, u32 source)
+{
+ int consumers = 0;
+
+ /* count dac/adc consumers unconditional */
+ if (ana_pwr_reg & SGTL5000_DAC_POWERUP)
+ consumers++;
+ if (ana_pwr_reg & SGTL5000_ADC_POWERUP)
+ consumers++;
+
+ /*
+ * If the event comes from HP and Line-In is selected,
+ * current action is 'DAC to be powered down'.
+ * As HP_POWERUP is not set when HP muxed to line-in,
+ * we need to keep VAG power ON.
+ */
+ if (source == HP_POWER_EVENT) {
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN)
+ consumers++;
+ } else {
+ if (ana_pwr_reg & SGTL5000_HP_POWERUP)
+ consumers++;
+ }
+
+ return consumers;
+}
+
+static void vag_power_off(struct snd_soc_component *component, u32 source)
+{
+ u16 ana_pwr = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_POWER);
+
+ if (!(ana_pwr & SGTL5000_VAG_POWERUP))
+ return;
+
+ /*
+ * This function calls when any of VAG power consumers is disappearing.
+ * Thus, if there is more than one consumer at the moment, as minimum
+ * one consumer will definitely stay after the end of the current
+ * event.
+ * Don't clear VAG_POWERUP if 2 or more consumers of VAG present:
+ * - LINE_IN (for HP events) / HP (for DAC/ADC events)
+ * - DAC
+ * - ADC
+ * (the current consumer is disappearing right now)
+ */
+ if (vag_power_consumers(component, ana_pwr, source) >= 2)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, 0);
+ /* In power down case, we need wait 400-1000 ms
+ * when VAG fully ramped down.
+ * As longer we wait, as smaller pop we've got.
+ */
+ msleep(SGTL5000_VAG_POWERDOWN_DELAY);
+}
+
/*
* mic_bias power on/off share the same register bits with
* output impedance of mic bias, when power on mic bias, we
@@ -170,36 +285,46 @@ static int mic_bias_event(struct snd_soc_dapm_widget *w,
return 0;
}
-/*
- * As manual described, ADC/DAC only works when VAG powerup,
- * So enabled VAG before ADC/DAC up.
- * In power down case, we need wait 400ms when vag fully ramped down.
- */
-static int power_vag_event(struct snd_soc_dapm_widget *w,
- struct snd_kcontrol *kcontrol, int event)
+static int vag_and_mute_control(struct snd_soc_component *component,
+ int event, int event_source)
{
- struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
- const u32 mask = SGTL5000_DAC_POWERUP | SGTL5000_ADC_POWERUP;
+ static const u16 mute_mask[] = {
+ /*
+ * Mask for HP_POWER_EVENT.
+ * Muxing Headphones have to be wrapped with mute/unmute
+ * headphones only.
+ */
+ SGTL5000_HP_MUTE,
+ /*
+ * Masks for DAC_POWER_EVENT/ADC_POWER_EVENT.
+ * Muxing DAC or ADC block have to wrapped with mute/unmute
+ * both headphones and line-out.
+ */
+ SGTL5000_OUTPUTS_MUTE,
+ SGTL5000_OUTPUTS_MUTE
+ };
+
+ struct sgtl5000_priv *sgtl5000 =
+ snd_soc_component_get_drvdata(component);
switch (event) {
+ case SND_SOC_DAPM_PRE_PMU:
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ break;
case SND_SOC_DAPM_POST_PMU:
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
- msleep(400);
+ vag_power_on(component, event_source);
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
-
case SND_SOC_DAPM_PRE_PMD:
- /*
- * Don't clear VAG_POWERUP, when both DAC and ADC are
- * operational to prevent inadvertently starving the
- * other one of them.
- */
- if ((snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
- mask) != mask) {
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, 0);
- msleep(400);
- }
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ vag_power_off(component, event_source);
+ break;
+ case SND_SOC_DAPM_POST_PMD:
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
default:
break;
@@ -208,6 +333,41 @@ static int power_vag_event(struct snd_soc_dapm_widget *w,
return 0;
}
+/*
+ * Mute Headphone when power it up/down.
+ * Control VAG power on HP power path.
+ */
+static int headphone_pga_event(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, HP_POWER_EVENT);
+}
+
+/* As manual describes, ADC/DAC powering up/down requires
+ * to mute outputs to avoid pops.
+ * Control VAG power on ADC/DAC power path.
+ */
+static int adc_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, ADC_POWER_EVENT);
+}
+
+static int dac_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, DAC_POWER_EVENT);
+}
+
/* input sources for ADC */
static const char *adc_mux_text[] = {
"MIC_IN", "LINE_IN"
@@ -280,7 +440,10 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
mic_bias_event,
SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
- SND_SOC_DAPM_PGA("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0),
+ SND_SOC_DAPM_PGA_E("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0,
+ headphone_pga_event,
+ SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
SND_SOC_DAPM_PGA("LO", SGTL5000_CHIP_ANA_POWER, 0, 0, NULL, 0),
SND_SOC_DAPM_MUX("Capture Mux", SND_SOC_NOPM, 0, 0, &adc_mux),
@@ -301,11 +464,12 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
0, SGTL5000_CHIP_DIG_POWER,
1, 0),
- SND_SOC_DAPM_ADC("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0),
- SND_SOC_DAPM_DAC("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0),
-
- SND_SOC_DAPM_PRE("VAG_POWER_PRE", power_vag_event),
- SND_SOC_DAPM_POST("VAG_POWER_POST", power_vag_event),
+ SND_SOC_DAPM_ADC_E("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0,
+ adc_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
+ SND_SOC_DAPM_DAC_E("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0,
+ dac_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
};
/* routes for sgtl5000 */
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b1f373a11d25fc9a5f7679c9b85799fe09b0dc4a Mon Sep 17 00:00:00 2001
From: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Date: Fri, 19 Jul 2019 10:05:31 +0000
Subject: [PATCH] ASoC: sgtl5000: Improve VAG power and mute control
VAG power control is improved to fit the manual [1]. This patch fixes at
least one bug: if the customer muxes Headphone to Line-In right after boot,
the VAG power remains off, which leads to poor sound quality from line-in.
I.e. after boot:
- Connect sound source to Line-In jack;
- Connect headphone to HP jack;
- Run following commands:
$ amixer set 'Headphone' 80%
$ amixer set 'Headphone Mux' LINE_IN
Change VAG power on/off control according to the following algorithm:
- turn VAG power ON on the 1st incoming event.
- keep it ON if there is any active VAG consumer (ADC/DAC/HP/Line-In).
- turn VAG power OFF when the last consumer's pre-down event comes.
- always delay after VAG power OFF to avoid pop.
- delay after VAG power ON if the initiating consumer is Line-In, as this
prevents pop during line-in muxing.
According to the data sheet [1], to avoid any pops/clicks,
the outputs should be muted during input/output
routing changes.
[1] https://www.nxp.com/docs/en/data-sheet/SGTL5000.pdf
Cc: stable(a)vger.kernel.org
Fixes: 9b34e6cc3bc2 ("ASoC: Add Freescale SGTL5000 codec support")
Signed-off-by: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Reviewed-by: Marcel Ziswiler <marcel.ziswiler(a)toradex.com>
Reviewed-by: Fabio Estevam <festevam(a)gmail.com>
Reviewed-by: Cezary Rojewski <cezary.rojewski(a)intel.com>
Link: https://lore.kernel.org/r/20190719100524.23300-3-oleksandr.suvorov@toradex.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c
index a6a4748c97f9..34cc85e49003 100644
--- a/sound/soc/codecs/sgtl5000.c
+++ b/sound/soc/codecs/sgtl5000.c
@@ -31,6 +31,13 @@
#define SGTL5000_DAP_REG_OFFSET 0x0100
#define SGTL5000_MAX_REG_OFFSET 0x013A
+/* Delay for the VAG ramp up */
+#define SGTL5000_VAG_POWERUP_DELAY 500 /* ms */
+/* Delay for the VAG ramp down */
+#define SGTL5000_VAG_POWERDOWN_DELAY 500 /* ms */
+
+#define SGTL5000_OUTPUTS_MUTE (SGTL5000_HP_MUTE | SGTL5000_LINE_OUT_MUTE)
+
/* default value of sgtl5000 registers */
static const struct reg_default sgtl5000_reg_defaults[] = {
{ SGTL5000_CHIP_DIG_POWER, 0x0000 },
@@ -123,6 +130,13 @@ enum {
I2S_SCLK_STRENGTH_HIGH,
};
+enum {
+ HP_POWER_EVENT,
+ DAC_POWER_EVENT,
+ ADC_POWER_EVENT,
+ LAST_POWER_EVENT = ADC_POWER_EVENT
+};
+
/* sgtl5000 private structure in codec */
struct sgtl5000_priv {
int sysclk; /* sysclk rate */
@@ -137,8 +151,109 @@ struct sgtl5000_priv {
u8 micbias_voltage;
u8 lrclk_strength;
u8 sclk_strength;
+ u16 mute_state[LAST_POWER_EVENT + 1];
};
+static inline int hp_sel_input(struct snd_soc_component *component)
+{
+ return (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_CTRL) &
+ SGTL5000_HP_SEL_MASK) >> SGTL5000_HP_SEL_SHIFT;
+}
+
+static inline u16 mute_output(struct snd_soc_component *component,
+ u16 mute_mask)
+{
+ u16 mute_reg = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_CTRL);
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_mask);
+ return mute_reg;
+}
+
+static inline void restore_output(struct snd_soc_component *component,
+ u16 mute_mask, u16 mute_reg)
+{
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_reg);
+}
+
+static void vag_power_on(struct snd_soc_component *component, u32 source)
+{
+ if (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
+ SGTL5000_VAG_POWERUP)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
+
+ /* When VAG powering on to get local loop from Line-In, the sleep
+ * is required to avoid loud pop.
+ */
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN &&
+ source == HP_POWER_EVENT)
+ msleep(SGTL5000_VAG_POWERUP_DELAY);
+}
+
+static int vag_power_consumers(struct snd_soc_component *component,
+ u16 ana_pwr_reg, u32 source)
+{
+ int consumers = 0;
+
+ /* count dac/adc consumers unconditional */
+ if (ana_pwr_reg & SGTL5000_DAC_POWERUP)
+ consumers++;
+ if (ana_pwr_reg & SGTL5000_ADC_POWERUP)
+ consumers++;
+
+ /*
+ * If the event comes from HP and Line-In is selected,
+ * current action is 'DAC to be powered down'.
+ * As HP_POWERUP is not set when HP muxed to line-in,
+ * we need to keep VAG power ON.
+ */
+ if (source == HP_POWER_EVENT) {
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN)
+ consumers++;
+ } else {
+ if (ana_pwr_reg & SGTL5000_HP_POWERUP)
+ consumers++;
+ }
+
+ return consumers;
+}
+
+static void vag_power_off(struct snd_soc_component *component, u32 source)
+{
+ u16 ana_pwr = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_POWER);
+
+ if (!(ana_pwr & SGTL5000_VAG_POWERUP))
+ return;
+
+ /*
+ * This function calls when any of VAG power consumers is disappearing.
+ * Thus, if there is more than one consumer at the moment, as minimum
+ * one consumer will definitely stay after the end of the current
+ * event.
+ * Don't clear VAG_POWERUP if 2 or more consumers of VAG present:
+ * - LINE_IN (for HP events) / HP (for DAC/ADC events)
+ * - DAC
+ * - ADC
+ * (the current consumer is disappearing right now)
+ */
+ if (vag_power_consumers(component, ana_pwr, source) >= 2)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, 0);
+ /* In power down case, we need wait 400-1000 ms
+ * when VAG fully ramped down.
+ * As longer we wait, as smaller pop we've got.
+ */
+ msleep(SGTL5000_VAG_POWERDOWN_DELAY);
+}
+
/*
* mic_bias power on/off share the same register bits with
* output impedance of mic bias, when power on mic bias, we
@@ -170,36 +285,46 @@ static int mic_bias_event(struct snd_soc_dapm_widget *w,
return 0;
}
-/*
- * As manual described, ADC/DAC only works when VAG powerup,
- * So enabled VAG before ADC/DAC up.
- * In power down case, we need wait 400ms when vag fully ramped down.
- */
-static int power_vag_event(struct snd_soc_dapm_widget *w,
- struct snd_kcontrol *kcontrol, int event)
+static int vag_and_mute_control(struct snd_soc_component *component,
+ int event, int event_source)
{
- struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
- const u32 mask = SGTL5000_DAC_POWERUP | SGTL5000_ADC_POWERUP;
+ static const u16 mute_mask[] = {
+ /*
+ * Mask for HP_POWER_EVENT.
+ * Muxing Headphones have to be wrapped with mute/unmute
+ * headphones only.
+ */
+ SGTL5000_HP_MUTE,
+ /*
+ * Masks for DAC_POWER_EVENT/ADC_POWER_EVENT.
+ * Muxing DAC or ADC block have to wrapped with mute/unmute
+ * both headphones and line-out.
+ */
+ SGTL5000_OUTPUTS_MUTE,
+ SGTL5000_OUTPUTS_MUTE
+ };
+
+ struct sgtl5000_priv *sgtl5000 =
+ snd_soc_component_get_drvdata(component);
switch (event) {
+ case SND_SOC_DAPM_PRE_PMU:
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ break;
case SND_SOC_DAPM_POST_PMU:
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
- msleep(400);
+ vag_power_on(component, event_source);
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
-
case SND_SOC_DAPM_PRE_PMD:
- /*
- * Don't clear VAG_POWERUP, when both DAC and ADC are
- * operational to prevent inadvertently starving the
- * other one of them.
- */
- if ((snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
- mask) != mask) {
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, 0);
- msleep(400);
- }
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ vag_power_off(component, event_source);
+ break;
+ case SND_SOC_DAPM_POST_PMD:
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
default:
break;
@@ -208,6 +333,41 @@ static int power_vag_event(struct snd_soc_dapm_widget *w,
return 0;
}
+/*
+ * Mute Headphone when power it up/down.
+ * Control VAG power on HP power path.
+ */
+static int headphone_pga_event(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, HP_POWER_EVENT);
+}
+
+/* As manual describes, ADC/DAC powering up/down requires
+ * to mute outputs to avoid pops.
+ * Control VAG power on ADC/DAC power path.
+ */
+static int adc_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, ADC_POWER_EVENT);
+}
+
+static int dac_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, DAC_POWER_EVENT);
+}
+
/* input sources for ADC */
static const char *adc_mux_text[] = {
"MIC_IN", "LINE_IN"
@@ -280,7 +440,10 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
mic_bias_event,
SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
- SND_SOC_DAPM_PGA("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0),
+ SND_SOC_DAPM_PGA_E("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0,
+ headphone_pga_event,
+ SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
SND_SOC_DAPM_PGA("LO", SGTL5000_CHIP_ANA_POWER, 0, 0, NULL, 0),
SND_SOC_DAPM_MUX("Capture Mux", SND_SOC_NOPM, 0, 0, &adc_mux),
@@ -301,11 +464,12 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
0, SGTL5000_CHIP_DIG_POWER,
1, 0),
- SND_SOC_DAPM_ADC("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0),
- SND_SOC_DAPM_DAC("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0),
-
- SND_SOC_DAPM_PRE("VAG_POWER_PRE", power_vag_event),
- SND_SOC_DAPM_POST("VAG_POWER_POST", power_vag_event),
+ SND_SOC_DAPM_ADC_E("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0,
+ adc_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
+ SND_SOC_DAPM_DAC_E("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0,
+ dac_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
};
/* routes for sgtl5000 */
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b1f373a11d25fc9a5f7679c9b85799fe09b0dc4a Mon Sep 17 00:00:00 2001
From: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Date: Fri, 19 Jul 2019 10:05:31 +0000
Subject: [PATCH] ASoC: sgtl5000: Improve VAG power and mute control
VAG power control is improved to fit the manual [1]. This patch fixes at
least one bug: if the customer muxes Headphone to Line-In right after boot,
the VAG power remains off, which leads to poor sound quality from line-in.
I.e. after boot:
- Connect sound source to Line-In jack;
- Connect headphone to HP jack;
- Run following commands:
$ amixer set 'Headphone' 80%
$ amixer set 'Headphone Mux' LINE_IN
Change VAG power on/off control according to the following algorithm:
- turn VAG power ON on the 1st incoming event.
- keep it ON if there is any active VAG consumer (ADC/DAC/HP/Line-In).
- turn VAG power OFF when the last consumer's pre-down event comes.
- always delay after VAG power OFF to avoid pop.
- delay after VAG power ON if the initiating consumer is Line-In, as this
prevents pop during line-in muxing.
According to the data sheet [1], to avoid any pops/clicks,
the outputs should be muted during input/output
routing changes.
[1] https://www.nxp.com/docs/en/data-sheet/SGTL5000.pdf
Cc: stable(a)vger.kernel.org
Fixes: 9b34e6cc3bc2 ("ASoC: Add Freescale SGTL5000 codec support")
Signed-off-by: Oleksandr Suvorov <oleksandr.suvorov(a)toradex.com>
Reviewed-by: Marcel Ziswiler <marcel.ziswiler(a)toradex.com>
Reviewed-by: Fabio Estevam <festevam(a)gmail.com>
Reviewed-by: Cezary Rojewski <cezary.rojewski(a)intel.com>
Link: https://lore.kernel.org/r/20190719100524.23300-3-oleksandr.suvorov@toradex.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c
index a6a4748c97f9..34cc85e49003 100644
--- a/sound/soc/codecs/sgtl5000.c
+++ b/sound/soc/codecs/sgtl5000.c
@@ -31,6 +31,13 @@
#define SGTL5000_DAP_REG_OFFSET 0x0100
#define SGTL5000_MAX_REG_OFFSET 0x013A
+/* Delay for the VAG ramp up */
+#define SGTL5000_VAG_POWERUP_DELAY 500 /* ms */
+/* Delay for the VAG ramp down */
+#define SGTL5000_VAG_POWERDOWN_DELAY 500 /* ms */
+
+#define SGTL5000_OUTPUTS_MUTE (SGTL5000_HP_MUTE | SGTL5000_LINE_OUT_MUTE)
+
/* default value of sgtl5000 registers */
static const struct reg_default sgtl5000_reg_defaults[] = {
{ SGTL5000_CHIP_DIG_POWER, 0x0000 },
@@ -123,6 +130,13 @@ enum {
I2S_SCLK_STRENGTH_HIGH,
};
+enum {
+ HP_POWER_EVENT,
+ DAC_POWER_EVENT,
+ ADC_POWER_EVENT,
+ LAST_POWER_EVENT = ADC_POWER_EVENT
+};
+
/* sgtl5000 private structure in codec */
struct sgtl5000_priv {
int sysclk; /* sysclk rate */
@@ -137,8 +151,109 @@ struct sgtl5000_priv {
u8 micbias_voltage;
u8 lrclk_strength;
u8 sclk_strength;
+ u16 mute_state[LAST_POWER_EVENT + 1];
};
+static inline int hp_sel_input(struct snd_soc_component *component)
+{
+ return (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_CTRL) &
+ SGTL5000_HP_SEL_MASK) >> SGTL5000_HP_SEL_SHIFT;
+}
+
+static inline u16 mute_output(struct snd_soc_component *component,
+ u16 mute_mask)
+{
+ u16 mute_reg = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_CTRL);
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_mask);
+ return mute_reg;
+}
+
+static inline void restore_output(struct snd_soc_component *component,
+ u16 mute_mask, u16 mute_reg)
+{
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_CTRL,
+ mute_mask, mute_reg);
+}
+
+static void vag_power_on(struct snd_soc_component *component, u32 source)
+{
+ if (snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
+ SGTL5000_VAG_POWERUP)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
+
+ /* When VAG powering on to get local loop from Line-In, the sleep
+ * is required to avoid loud pop.
+ */
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN &&
+ source == HP_POWER_EVENT)
+ msleep(SGTL5000_VAG_POWERUP_DELAY);
+}
+
+static int vag_power_consumers(struct snd_soc_component *component,
+ u16 ana_pwr_reg, u32 source)
+{
+ int consumers = 0;
+
+ /* count dac/adc consumers unconditional */
+ if (ana_pwr_reg & SGTL5000_DAC_POWERUP)
+ consumers++;
+ if (ana_pwr_reg & SGTL5000_ADC_POWERUP)
+ consumers++;
+
+ /*
+ * If the event comes from HP and Line-In is selected,
+ * current action is 'DAC to be powered down'.
+ * As HP_POWERUP is not set when HP muxed to line-in,
+ * we need to keep VAG power ON.
+ */
+ if (source == HP_POWER_EVENT) {
+ if (hp_sel_input(component) == SGTL5000_HP_SEL_LINE_IN)
+ consumers++;
+ } else {
+ if (ana_pwr_reg & SGTL5000_HP_POWERUP)
+ consumers++;
+ }
+
+ return consumers;
+}
+
+static void vag_power_off(struct snd_soc_component *component, u32 source)
+{
+ u16 ana_pwr = snd_soc_component_read32(component,
+ SGTL5000_CHIP_ANA_POWER);
+
+ if (!(ana_pwr & SGTL5000_VAG_POWERUP))
+ return;
+
+ /*
+ * This function calls when any of VAG power consumers is disappearing.
+ * Thus, if there is more than one consumer at the moment, as minimum
+ * one consumer will definitely stay after the end of the current
+ * event.
+ * Don't clear VAG_POWERUP if 2 or more consumers of VAG present:
+ * - LINE_IN (for HP events) / HP (for DAC/ADC events)
+ * - DAC
+ * - ADC
+ * (the current consumer is disappearing right now)
+ */
+ if (vag_power_consumers(component, ana_pwr, source) >= 2)
+ return;
+
+ snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
+ SGTL5000_VAG_POWERUP, 0);
+ /* In power down case, we need wait 400-1000 ms
+ * when VAG fully ramped down.
+ * As longer we wait, as smaller pop we've got.
+ */
+ msleep(SGTL5000_VAG_POWERDOWN_DELAY);
+}
+
/*
* mic_bias power on/off share the same register bits with
* output impedance of mic bias, when power on mic bias, we
@@ -170,36 +285,46 @@ static int mic_bias_event(struct snd_soc_dapm_widget *w,
return 0;
}
-/*
- * As manual described, ADC/DAC only works when VAG powerup,
- * So enabled VAG before ADC/DAC up.
- * In power down case, we need wait 400ms when vag fully ramped down.
- */
-static int power_vag_event(struct snd_soc_dapm_widget *w,
- struct snd_kcontrol *kcontrol, int event)
+static int vag_and_mute_control(struct snd_soc_component *component,
+ int event, int event_source)
{
- struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
- const u32 mask = SGTL5000_DAC_POWERUP | SGTL5000_ADC_POWERUP;
+ static const u16 mute_mask[] = {
+ /*
+ * Mask for HP_POWER_EVENT.
+ * Muxing Headphones have to be wrapped with mute/unmute
+ * headphones only.
+ */
+ SGTL5000_HP_MUTE,
+ /*
+ * Masks for DAC_POWER_EVENT/ADC_POWER_EVENT.
+ * Muxing DAC or ADC block have to wrapped with mute/unmute
+ * both headphones and line-out.
+ */
+ SGTL5000_OUTPUTS_MUTE,
+ SGTL5000_OUTPUTS_MUTE
+ };
+
+ struct sgtl5000_priv *sgtl5000 =
+ snd_soc_component_get_drvdata(component);
switch (event) {
+ case SND_SOC_DAPM_PRE_PMU:
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ break;
case SND_SOC_DAPM_POST_PMU:
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
- msleep(400);
+ vag_power_on(component, event_source);
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
-
case SND_SOC_DAPM_PRE_PMD:
- /*
- * Don't clear VAG_POWERUP, when both DAC and ADC are
- * operational to prevent inadvertently starving the
- * other one of them.
- */
- if ((snd_soc_component_read32(component, SGTL5000_CHIP_ANA_POWER) &
- mask) != mask) {
- snd_soc_component_update_bits(component, SGTL5000_CHIP_ANA_POWER,
- SGTL5000_VAG_POWERUP, 0);
- msleep(400);
- }
+ sgtl5000->mute_state[event_source] =
+ mute_output(component, mute_mask[event_source]);
+ vag_power_off(component, event_source);
+ break;
+ case SND_SOC_DAPM_POST_PMD:
+ restore_output(component, mute_mask[event_source],
+ sgtl5000->mute_state[event_source]);
break;
default:
break;
@@ -208,6 +333,41 @@ static int power_vag_event(struct snd_soc_dapm_widget *w,
return 0;
}
+/*
+ * Mute Headphone when power it up/down.
+ * Control VAG power on HP power path.
+ */
+static int headphone_pga_event(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, HP_POWER_EVENT);
+}
+
+/* As manual describes, ADC/DAC powering up/down requires
+ * to mute outputs to avoid pops.
+ * Control VAG power on ADC/DAC power path.
+ */
+static int adc_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, ADC_POWER_EVENT);
+}
+
+static int dac_updown_depop(struct snd_soc_dapm_widget *w,
+ struct snd_kcontrol *kcontrol, int event)
+{
+ struct snd_soc_component *component =
+ snd_soc_dapm_to_component(w->dapm);
+
+ return vag_and_mute_control(component, event, DAC_POWER_EVENT);
+}
+
/* input sources for ADC */
static const char *adc_mux_text[] = {
"MIC_IN", "LINE_IN"
@@ -280,7 +440,10 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
mic_bias_event,
SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
- SND_SOC_DAPM_PGA("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0),
+ SND_SOC_DAPM_PGA_E("HP", SGTL5000_CHIP_ANA_POWER, 4, 0, NULL, 0,
+ headphone_pga_event,
+ SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
SND_SOC_DAPM_PGA("LO", SGTL5000_CHIP_ANA_POWER, 0, 0, NULL, 0),
SND_SOC_DAPM_MUX("Capture Mux", SND_SOC_NOPM, 0, 0, &adc_mux),
@@ -301,11 +464,12 @@ static const struct snd_soc_dapm_widget sgtl5000_dapm_widgets[] = {
0, SGTL5000_CHIP_DIG_POWER,
1, 0),
- SND_SOC_DAPM_ADC("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0),
- SND_SOC_DAPM_DAC("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0),
-
- SND_SOC_DAPM_PRE("VAG_POWER_PRE", power_vag_event),
- SND_SOC_DAPM_POST("VAG_POWER_POST", power_vag_event),
+ SND_SOC_DAPM_ADC_E("ADC", "Capture", SGTL5000_CHIP_ANA_POWER, 1, 0,
+ adc_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
+ SND_SOC_DAPM_DAC_E("DAC", "Playback", SGTL5000_CHIP_ANA_POWER, 3, 0,
+ dac_updown_depop, SND_SOC_DAPM_PRE_POST_PMU |
+ SND_SOC_DAPM_PRE_POST_PMD),
};
/* routes for sgtl5000 */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 567926cca99ba1750be8aae9c4178796bf9bb90b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 1 Oct 2019 09:21:23 -0700
Subject: [PATCH] KVM: nVMX: Fix consistency check on injected exception error
code
Current versions of Intel's SDM incorrectly state that "bits 31:15 of
the VM-Entry exception error-code field" must be zero. In reality, bits
31:16 must be zero, i.e. error codes are 16-bit values.
The bogus error code check manifests as an unexpected VM-Entry failure
due to an invalid code field (error number 7) in L1, e.g. when injecting
a #GP with error_code=0x9f00.
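The effect of the one-bit change in the mask is easy to check with that
example error code. The snippet below is a stand-alone illustration;
GENMASK is re-defined locally only because the kernel's <linux/bits.h>
definition is not available to a userspace test.

#include <stdint.h>
#include <stdio.h>

#define GENMASK(h, l) (((~0u) << (l)) & (~0u >> (31 - (h))))

int main(void)
{
	uint32_t error_code = 0x9f00;	/* example error code from above */

	printf("old mask 31:15 -> %s\n",
	       (error_code & GENMASK(31, 15)) ? "VM-entry fails" : "ok");
	printf("new mask 31:16 -> %s\n",
	       (error_code & GENMASK(31, 16)) ? "VM-entry fails" : "ok");
	return 0;
}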
Nadav previously reported the bug[*], both to KVM and Intel, and fixed
the associated kvm-unit-test.
[*] https://patchwork.kernel.org/patch/11124749/
Reported-by: Nadav Amit <namit(a)vmware.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 41abc62c9a8a..e76eb4f07f6c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
/* VM-entry exception error code */
if (CC(has_error_code &&
- vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)))
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
return -EINVAL;
/* VM-entry interruption-info field: reserved bits */
The patch below does not apply to the 5.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 567926cca99ba1750be8aae9c4178796bf9bb90b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 1 Oct 2019 09:21:23 -0700
Subject: [PATCH] KVM: nVMX: Fix consistency check on injected exception error
code
Current versions of Intel's SDM incorrectly state that "bits 31:15 of
the VM-Entry exception error-code field" must be zero. In reality, bits
31:16 must be zero, i.e. error codes are 16-bit values.
The bogus error code check manifests as an unexpected VM-Entry failure
due to an invalid code field (error number 7) in L1, e.g. when injecting
a #GP with error_code=0x9f00.
Nadav previously reported the bug[*], both to KVM and Intel, and fixed
the associated kvm-unit-test.
[*] https://patchwork.kernel.org/patch/11124749/
Reported-by: Nadav Amit <namit(a)vmware.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 41abc62c9a8a..e76eb4f07f6c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
/* VM-entry exception error code */
if (CC(has_error_code &&
- vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)))
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
return -EINVAL;
/* VM-entry interruption-info field: reserved bits */
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d28eafc5a64045c78136162af9d4ba42f8230080 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus(a)ozlabs.org>
Date: Tue, 27 Aug 2019 11:31:37 +1000
Subject: [PATCH] KVM: PPC: Book3S HV: Check for MMU ready on piggybacked
virtual cores
When we are running multiple vcores on the same physical core, they
could be from different VMs and so it is possible that one of the
VMs could have its arch.mmu_ready flag cleared (for example by a
concurrent HPT resize) when we go to run it on a physical core.
We currently check the arch.mmu_ready flag for the primary vcore
but not the flags for the other vcores that will be run alongside
it. This adds that check, and also a check when we select the
secondary vcores from the preempted vcores list.
Cc: stable(a)vger.kernel.org # v4.14+
Fixes: 38c53af85306 ("KVM: PPC: Book3S HV: Fix exclusion between HPT resizing and other HPT updates")
Signed-off-by: Paul Mackerras <paulus(a)ozlabs.org>
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cde3f5a4b3e4..36d72e9faddf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2860,7 +2860,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
if (!spin_trylock(&pvc->lock))
continue;
prepare_threads(pvc);
- if (!pvc->n_runnable) {
+ if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
list_del_init(&pvc->preempt_list);
if (pvc->runner == NULL) {
pvc->vcore_state = VCORE_INACTIVE;
@@ -2881,15 +2881,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
spin_unlock(&lp->lock);
}
-static bool recheck_signals(struct core_info *cip)
+static bool recheck_signals_and_mmu(struct core_info *cip)
{
int sub, i;
struct kvm_vcpu *vcpu;
+ struct kvmppc_vcore *vc;
- for (sub = 0; sub < cip->n_subcores; ++sub)
- for_each_runnable_thread(i, vcpu, cip->vc[sub])
+ for (sub = 0; sub < cip->n_subcores; ++sub) {
+ vc = cip->vc[sub];
+ if (!vc->kvm->arch.mmu_ready)
+ return true;
+ for_each_runnable_thread(i, vcpu, vc)
if (signal_pending(vcpu->arch.run_task))
return true;
+ }
return false;
}
@@ -3119,7 +3124,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
- recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
+ recheck_signals_and_mmu(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8d4ba9c931bc384bcc6889a43915aaaf19d3e499 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus(a)ozlabs.org>
Date: Tue, 13 Aug 2019 20:01:00 +1000
Subject: [PATCH] KVM: PPC: Book3S HV: Don't push XIVE context when not using
XIVE device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
At present, when running a guest on POWER9 using HV KVM but not using
an in-kernel interrupt controller (XICS or XIVE), for example if QEMU
is run with the kernel_irqchip=off option, the guest entry code goes
ahead and tries to load the guest context into the XIVE hardware, even
though no context has been set up.
To fix this, we check that the "CAM word" is non-zero before pushing
it to the hardware. The CAM word is initialized to a non-zero value
in kvmppc_xive_connect_vcpu() and kvmppc_xive_native_connect_vcpu(),
and is now cleared in kvmppc_xive_{,native_}cleanup_vcpu.
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable(a)vger.kernel.org # v4.12+
Reported-by: Cédric Le Goater <clg(a)kaod.org>
Signed-off-by: Paul Mackerras <paulus(a)ozlabs.org>
Reviewed-by: Cédric Le Goater <clg(a)kaod.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20190813100100.GC9567@blackberry
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2e7e788eb0cf..07181d0dfcb7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
ld r11, VCPU_XIVE_SAVED_STATE(r4)
li r9, TM_QW1_OS
lwz r8, VCPU_XIVE_CAM_WORD(r4)
+ cmpwi r8, 0
+ beq no_xive
li r7, TM_QW1_OS + TM_WORD2
mfmsr r0
andi. r0, r0, MSR_DR /* in real mode? */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 09f838aa3138..586867e46e51 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
- if (!tima)
+ /*
+ * Nothing to do if the platform doesn't have a XIVE
+ * or this vCPU doesn't have its own XIVE context
+ * (e.g. because it's not using an in-kernel interrupt controller).
+ */
+ if (!tima || !vcpu->arch.xive_cam_word)
return;
+
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
@@ -1146,6 +1152,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Disable the VP */
xive_native_disable_vp(xc->vp_id);
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
/* Free the queues */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
struct xive_q *q = &xc->queues[i];
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 368427fcad20..11b91b46fc39 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -81,6 +81,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Disable the VP */
xive_native_disable_vp(xc->vp_id);
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
/* Free the queues */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
kvmppc_xive_native_cleanup_queue(vcpu, i);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8d4ba9c931bc384bcc6889a43915aaaf19d3e499 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus(a)ozlabs.org>
Date: Tue, 13 Aug 2019 20:01:00 +1000
Subject: [PATCH] KVM: PPC: Book3S HV: Don't push XIVE context when not using
XIVE device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
At present, when running a guest on POWER9 using HV KVM but not using
an in-kernel interrupt controller (XICS or XIVE), for example if QEMU
is run with the kernel_irqchip=off option, the guest entry code goes
ahead and tries to load the guest context into the XIVE hardware, even
though no context has been set up.
To fix this, we check that the "CAM word" is non-zero before pushing
it to the hardware. The CAM word is initialized to a non-zero value
in kvmppc_xive_connect_vcpu() and kvmppc_xive_native_connect_vcpu(),
and is now cleared in kvmppc_xive_{,native_}cleanup_vcpu.
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable(a)vger.kernel.org # v4.12+
Reported-by: Cédric Le Goater <clg(a)kaod.org>
Signed-off-by: Paul Mackerras <paulus(a)ozlabs.org>
Reviewed-by: Cédric Le Goater <clg(a)kaod.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20190813100100.GC9567@blackberry
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2e7e788eb0cf..07181d0dfcb7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
ld r11, VCPU_XIVE_SAVED_STATE(r4)
li r9, TM_QW1_OS
lwz r8, VCPU_XIVE_CAM_WORD(r4)
+ cmpwi r8, 0
+ beq no_xive
li r7, TM_QW1_OS + TM_WORD2
mfmsr r0
andi. r0, r0, MSR_DR /* in real mode? */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 09f838aa3138..586867e46e51 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
- if (!tima)
+ /*
+ * Nothing to do if the platform doesn't have a XIVE
+ * or this vCPU doesn't have its own XIVE context
+ * (e.g. because it's not using an in-kernel interrupt controller).
+ */
+ if (!tima || !vcpu->arch.xive_cam_word)
return;
+
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
@@ -1146,6 +1152,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Disable the VP */
xive_native_disable_vp(xc->vp_id);
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
/* Free the queues */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
struct xive_q *q = &xc->queues[i];
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 368427fcad20..11b91b46fc39 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -81,6 +81,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Disable the VP */
xive_native_disable_vp(xc->vp_id);
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
/* Free the queues */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
kvmppc_xive_native_cleanup_queue(vcpu, i);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 237aed48c642328ff0ab19b63423634340224a06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg(a)kaod.org>
Date: Tue, 6 Aug 2019 19:25:38 +0200
Subject: [PATCH] KVM: PPC: Book3S HV: XIVE: Free escalation interrupts before
disabling the VP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When a vCPU is brought down, the XIVE VP (Virtual Processor) is first
disabled and then the event notification queues are freed. When freeing
the queues, we check for possible escalation interrupts and free them
also.
But when a XIVE VP is disabled, the underlying XIVE ENDs (Event
Notification Descriptors) are also disabled in OPAL. When an END is
disabled, its ESB pages (ESn and ESe) are disabled and loads return
all 1s, which means that any access to the ESB page of the escalation
interrupt will return invalid values.
When an interrupt is freed, the shutdown handler computes a 'saved_p'
field from the value returned by a load in xive_do_source_set_mask().
This value is incorrect for escalation interrupts for the reason
described above.
This has no impact on Linux/KVM today because we don't make use of
'saved_p', but future changes will introduce a xive_get_irqchip_state()
handler. This handler will use the 'saved_p' field to return the state
of an interrupt, and with 'saved_p' being incorrect a softlockup will occur.
Fix the vCPU cleanup sequence by first freeing the escalation interrupts
(if any), then disabling the XIVE VP, and finally freeing the queues.
Fixes: 90c73795afa2 ("KVM: PPC: Book3S HV: Add a new KVM device for the XIVE native exploitation mode")
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Cédric Le Goater <clg(a)kaod.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20190806172538.5087-1-clg@kaod.org
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index e3ba67095895..09f838aa3138 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1134,20 +1134,22 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Mask the VP IPI */
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
- /* Disable the VP */
- xive_native_disable_vp(xc->vp_id);
-
- /* Free the queues & associated interrupts */
+ /* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
-
- /* Free the escalation irq */
if (xc->esc_virq[i]) {
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
}
- /* Free the queue */
+ }
+
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Free the queues */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+
xive_native_disable_queue(xc->vp_id, q, i);
if (q->qpage) {
free_pages((unsigned long)q->qpage,
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index a998823f68a3..368427fcad20 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -67,10 +67,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
xc->valid = false;
kvmppc_xive_disable_vcpu_interrupts(vcpu);
- /* Disable the VP */
- xive_native_disable_vp(xc->vp_id);
-
- /* Free the queues & associated interrupts */
+ /* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
/* Free the escalation irq */
if (xc->esc_virq[i]) {
@@ -79,8 +76,13 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
kfree(xc->esc_virq_names[i]);
xc->esc_virq[i] = 0;
}
+ }
- /* Free the queue */
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Free the queues */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
kvmppc_xive_native_cleanup_queue(vcpu, i);
}
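[editor's note] In rough pseudocode, the corrected teardown order for both the XICS-on-XIVE and the XIVE-native device is the following (a sketch only; free_escalation_irq() and free_queue() are stand-ins for the per-queue cleanup shown in the hunks above):

  /* 1. Free the escalation interrupts while the ESB pages of the
   *    underlying ENDs are still valid. */
  for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++)
          free_escalation_irq(xc, i);

  /* 2. Disable the VP; OPAL disables the associated ENDs and their ESB
   *    pages, so loads from them would now return all 1s. */
  xive_native_disable_vp(xc->vp_id);

  /* 3. Only then disable and free the event notification queues. */
  for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++)
          free_queue(xc, i);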
I'm announcing the release of the 4.4.196 kernel.
All users of the 4.4 kernel series must upgrade.
The updated 4.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 -
arch/arm/mm/fault.c | 4 +-
arch/arm/mm/fault.h | 1
arch/powerpc/include/asm/futex.h | 3 -
arch/powerpc/kernel/exceptions-64s.S | 4 ++
arch/powerpc/kernel/rtas.c | 11 ++++--
arch/powerpc/platforms/pseries/mobility.c | 9 +++++
arch/powerpc/platforms/pseries/setup.c | 3 +
arch/s390/hypfs/inode.c | 9 ++---
drivers/android/binder.c | 26 ++++++++++++++-
drivers/char/ipmi/ipmi_si_intf.c | 24 +++++++++++---
drivers/clk/clk-qoriq.c | 2 -
drivers/clk/sirf/clk-common.c | 12 ++++---
drivers/gpu/drm/radeon/radeon_connectors.c | 2 -
drivers/hid/hid-apple.c | 49 ++++++++++++++++-------------
drivers/mfd/intel-lpss-pci.c | 2 +
drivers/net/ethernet/qlogic/qla3xxx.c | 1
drivers/net/usb/hso.c | 12 ++++---
drivers/net/xen-netfront.c | 17 +++++-----
drivers/pinctrl/pinctrl-tegra.c | 4 +-
drivers/scsi/scsi_logging.c | 48 +---------------------------
drivers/vfio/pci/vfio_pci.c | 17 +++++++---
drivers/video/fbdev/ssd1307fb.c | 2 -
fs/fat/dir.c | 13 ++++++-
fs/fat/fatent.c | 3 +
fs/ocfs2/dlm/dlmunlock.c | 23 +++++++++++--
include/scsi/scsi_dbg.h | 2 -
lib/Kconfig.debug | 2 -
net/ipv4/route.c | 5 +-
net/ipv6/ip6_input.c | 10 +++++
net/nfc/llcp_sock.c | 7 +++-
net/nfc/netlink.c | 6 ++-
net/rds/ib.c | 6 +--
net/sched/sch_cbq.c | 27 +++++++++++++--
net/sched/sch_dsmark.c | 2 +
security/smack/smack_access.c | 4 +-
security/smack/smack_lsm.c | 7 ++--
37 files changed, 246 insertions(+), 135 deletions(-)
Andrey Konovalov (1):
NFC: fix attrs checks in netlink interface
Bart Van Assche (1):
scsi: core: Reduce memory required for SCSI logging
Changwei Ge (1):
ocfs2: wait for recovering done after direct unlock request
Christophe Leroy (1):
powerpc/futex: Fix warning: 'oldval' may be used uninitialized in this function
Corey Minyard (1):
ipmi_si: Only schedule continuously in the thread in maintenance mode
David Howells (1):
hypfs: Fix error number left in struct pointer member
Dongli Zhang (1):
xen-netfront: do not use ~0U as error return value for xennet_fill_frags()
Dotan Barak (1):
net/rds: Fix error handling in rds_ib_add_one()
Eric Biggers (1):
smack: use GFP_NOFS while holding inode_smack::smk_lock
Eric Dumazet (4):
ipv6: drop incoming packets having a v4mapped source address
nfc: fix memory leak in llcp_sock_bind()
sch_dsmark: fix potential NULL deref in dsmark_init()
sch_cbq: validate TCA_CBQ_WRROPT to avoid crash
Greg Kroah-Hartman (1):
Linux 4.4.196
Jann Horn (1):
Smack: Don't ignore other bprm->unsafe flags if LSM_UNSAFE_PTRACE is set
Jia-Ju Bai (2):
gpu: drm: radeon: Fix a possible null-pointer dereference in radeon_connector_set_property()
security: smack: Fix possible null-pointer dereferences in smack_socket_sock_rcv_skb()
Joao Moreno (1):
HID: apple: Fix stuck function keys when using FN
Johan Hovold (1):
hso: fix NULL-deref on tty open
Kai-Heng Feng (1):
mfd: intel-lpss: Remove D3cold delay
Marko Kohtala (1):
video: ssd1307fb: Start page range at page_offset
Martijn Coenen (2):
ANDROID: binder: remove waitqueue when thread exits.
ANDROID: binder: synchronize_rcu() when using POLLFREE.
Nathan Huckleberry (1):
clk: qoriq: Fix -Wunused-const-variable
Nathan Lynch (3):
powerpc/rtas: use device model APIs and serialization during LPM
powerpc/pseries/mobility: use cond_resched when updating device tree
powerpc/pseries: correctly track irq state in default idle
Navid Emamdoost (1):
net: qlogic: Fix memory leak in ql_alloc_large_buffers
Nicholas Piggin (1):
powerpc/64s/exception: machine check use correct cfar for late handler
Nicolas Boichat (1):
kmemleak: increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE default to 16K
OGAWA Hirofumi (1):
fat: work around race with userspace's read via blockdev while mounting
Paolo Abeni (1):
net: ipv4: avoid mixed n_redirects and rate_tokens usage
Sowjanya Komatineni (1):
pinctrl: tegra: Fix write barrier placement in pmx_writel
Stephen Boyd (1):
clk: sirf: Don't reference clk_init_data after registration
Will Deacon (1):
ARM: 8898/1: mm: Don't treat faults reported from cache maintenance as writes
hexin (1):
vfio_pci: Restore original state on release
This is the start of the stable review cycle for the 4.4.196 release.
There are 36 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Tue 08 Oct 2019 05:07:10 PM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.196-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.4.196-rc1
Andrey Konovalov <andreyknvl(a)google.com>
NFC: fix attrs checks in netlink interface
Eric Biggers <ebiggers(a)google.com>
smack: use GFP_NOFS while holding inode_smack::smk_lock
Jann Horn <jannh(a)google.com>
Smack: Don't ignore other bprm->unsafe flags if LSM_UNSAFE_PTRACE is set
Eric Dumazet <edumazet(a)google.com>
sch_cbq: validate TCA_CBQ_WRROPT to avoid crash
Dotan Barak <dotanb(a)dev.mellanox.co.il>
net/rds: Fix error handling in rds_ib_add_one()
Dongli Zhang <dongli.zhang(a)oracle.com>
xen-netfront: do not use ~0U as error return value for xennet_fill_frags()
Eric Dumazet <edumazet(a)google.com>
sch_dsmark: fix potential NULL deref in dsmark_init()
Eric Dumazet <edumazet(a)google.com>
nfc: fix memory leak in llcp_sock_bind()
Navid Emamdoost <navid.emamdoost(a)gmail.com>
net: qlogic: Fix memory leak in ql_alloc_large_buffers
Paolo Abeni <pabeni(a)redhat.com>
net: ipv4: avoid mixed n_redirects and rate_tokens usage
Eric Dumazet <edumazet(a)google.com>
ipv6: drop incoming packets having a v4mapped source address
Johan Hovold <johan(a)kernel.org>
hso: fix NULL-deref on tty open
Martijn Coenen <maco(a)android.com>
ANDROID: binder: synchronize_rcu() when using POLLFREE.
Martijn Coenen <maco(a)android.com>
ANDROID: binder: remove waitqueue when thread exits.
Nicolas Boichat <drinkcat(a)chromium.org>
kmemleak: increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE default to 16K
Changwei Ge <gechangwei(a)live.cn>
ocfs2: wait for recovering done after direct unlock request
David Howells <dhowells(a)redhat.com>
hypfs: Fix error number left in struct pointer member
OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
fat: work around race with userspace's read via blockdev while mounting
Jia-Ju Bai <baijiaju1990(a)gmail.com>
security: smack: Fix possible null-pointer dereferences in smack_socket_sock_rcv_skb()
Joao Moreno <mail(a)joaomoreno.com>
HID: apple: Fix stuck function keys when using FN
Will Deacon <will(a)kernel.org>
ARM: 8898/1: mm: Don't treat faults reported from cache maintenance as writes
Kai-Heng Feng <kai.heng.feng(a)canonical.com>
mfd: intel-lpss: Remove D3cold delay
Bart Van Assche <bvanassche(a)acm.org>
scsi: core: Reduce memory required for SCSI logging
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/pseries: correctly track irq state in default idle
Nicholas Piggin <npiggin(a)gmail.com>
powerpc/64s/exception: machine check use correct cfar for late handler
hexin <hexin.op(a)gmail.com>
vfio_pci: Restore original state on release
Sam Bobroff <sbobroff(a)linux.ibm.com>
powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
Sowjanya Komatineni <skomatineni(a)nvidia.com>
pinctrl: tegra: Fix write barrier placement in pmx_writel
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/pseries/mobility: use cond_resched when updating device tree
Christophe Leroy <christophe.leroy(a)c-s.fr>
powerpc/futex: Fix warning: 'oldval' may be used uninitialized in this function
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/rtas: use device model APIs and serialization during LPM
Stephen Boyd <sboyd(a)kernel.org>
clk: sirf: Don't reference clk_init_data after registration
Nathan Huckleberry <nhuck(a)google.com>
clk: qoriq: Fix -Wunused-const-variable
Corey Minyard <cminyard(a)mvista.com>
ipmi_si: Only schedule continuously in the thread in maintenance mode
Jia-Ju Bai <baijiaju1990(a)gmail.com>
gpu: drm: radeon: Fix a possible null-pointer dereference in radeon_connector_set_property()
Marko Kohtala <marko.kohtala(a)okoko.fi>
video: ssd1307fb: Start page range at page_offset
-------------
Diffstat:
Makefile | 4 +--
arch/arm/mm/fault.c | 4 +--
arch/arm/mm/fault.h | 1 +
arch/powerpc/include/asm/futex.h | 3 +-
arch/powerpc/kernel/eeh_driver.c | 11 ++++++-
arch/powerpc/kernel/exceptions-64s.S | 4 +++
arch/powerpc/kernel/rtas.c | 11 +++++--
arch/powerpc/platforms/pseries/mobility.c | 9 ++++++
arch/powerpc/platforms/pseries/setup.c | 3 ++
arch/s390/hypfs/inode.c | 9 +++---
drivers/android/binder.c | 26 +++++++++++++++-
drivers/char/ipmi/ipmi_si_intf.c | 24 ++++++++++++---
drivers/clk/clk-qoriq.c | 2 +-
drivers/clk/sirf/clk-common.c | 12 +++++---
drivers/gpu/drm/radeon/radeon_connectors.c | 2 +-
drivers/hid/hid-apple.c | 49 +++++++++++++++++-------------
drivers/mfd/intel-lpss-pci.c | 2 ++
drivers/net/ethernet/qlogic/qla3xxx.c | 1 +
drivers/net/usb/hso.c | 12 +++++---
drivers/net/xen-netfront.c | 17 ++++++-----
drivers/pinctrl/pinctrl-tegra.c | 4 ++-
drivers/scsi/scsi_logging.c | 48 ++---------------------------
drivers/vfio/pci/vfio_pci.c | 17 ++++++++---
drivers/video/fbdev/ssd1307fb.c | 2 +-
fs/fat/dir.c | 13 ++++++--
fs/fat/fatent.c | 3 ++
fs/ocfs2/dlm/dlmunlock.c | 23 +++++++++++---
include/scsi/scsi_dbg.h | 2 --
lib/Kconfig.debug | 2 +-
net/ipv4/route.c | 5 ++-
net/ipv6/ip6_input.c | 10 ++++++
net/nfc/llcp_sock.c | 7 ++++-
net/nfc/netlink.c | 6 ++--
net/rds/ib.c | 6 ++--
net/sched/sch_cbq.c | 27 +++++++++++++---
net/sched/sch_dsmark.c | 2 ++
security/smack/smack_access.c | 4 +--
security/smack/smack_lsm.c | 7 +++--
38 files changed, 257 insertions(+), 137 deletions(-)
Hi Sasha,
> On Oct 6, 2019, at 20:07, Sasha Levin <sashal(a)kernel.org> wrote:
>
> Hi,
>
> [This is an automated email]
>
> This commit has been processed because it contains a "Fixes:" tag,
> fixing commit: f7fac17ca925 xhci: Convert xhci_handshake() to use readl_poll_timeout_atomic().
>
> The bot has tested the following trees: v5.3.2, v5.2.18, v4.19.76, v4.14.146, v4.9.194, v4.4.194.
>
> v5.3.2: Build OK!
> v5.2.18: Build OK!
> v4.19.76: Build OK!
> v4.14.146: Build OK!
> v4.9.194: Failed to apply! Possible dependencies:
> 0b6c324c8b60 ("xhci: cleanup and refactor process_ctrl_td()")
> 0f1d832ed1fb ("usb: xhci: Add port test modes support for usb2.")
> 11644a765952 ("xhci: Add quirk to workaround the errata seen on Cavium Thunder-X2 Soc")
> 191edc5e2e51 ("xhci: Fix front USB ports on ASUS PRIME B350M-A")
> 1cc6d8617b91 ("usb: xhci: remove unnecessary second abort try")
> 2a72126de1bb ("xhci: Remove duplicate xhci urb giveback functions")
> 2d6d5769f82d ("xhci: fix non static symbol warning")
> 30a65b45bfb1 ("xhci: cleanup and refactor process_bulk_intr_td()")
> 446b31419cb1 ("xhci: refactor handle_tx_event() urb giveback")
> 4750bc78efdb ("usb: host: xhci support option to disable the xHCI USB2 HW LPM")
> 488dc164914f ("xhci: remove WARN_ON if dma mask is not set for platform devices")
> 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration")
> 505f581c48bc ("xhci: simplify if statement to make it more readable")
> 52ab86852f74 ("xhci: remove extra URB_SHORT_NOT_OK checks in xhci, core handles most cases")
> 6b7f40f71234 ("xhci: change xhci_set_link_state() to work with port structures")
> 76a35293b901 ("usb: host: xhci: simplify irq handler return")
> 9983a5fc39bf ("xhci: rename EP_HALT_PENDING to EP_STOP_CMD_PENDING")
> 9ef7fbbb4fdf ("xhci: Rename variables related to transfer descritpors")
> a6ff6cbf1fab ("usb: xhci: Add helper function xhci_set_power_on().")
> a7d57abcc8a5 ("xhci: workaround CSS timeout on AMD SNPS 3.0 xHC")
> d3519b9d9606 ("xhci: Manually give back cancelled URB if we can't queue it for cancel")
> d9f11ba9f107 ("xhci: Rework how we handle unresponsive or hoptlug removed hosts")
> e740b019d7c6 ("xhci: xhci-hub: use new port structures to get port address instead of port array")
> eaefcf246b56 ("xhci: change xhci_test_and_clear_bit() to use new port structure")
> f97c08ae329b ("xhci: rename endpoint related trb variables")
> f99265965b32 ("xhci: detect stop endpoint race using pending timer instead of counter.")
> ffd4b4fc0b9a ("xhci: Add helper to get xhci roothub from hcd")
>
> v4.4.194: Failed to apply! Possible dependencies:
> 11644a765952 ("xhci: Add quirk to workaround the errata seen on Cavium Thunder-X2 Soc")
> 191edc5e2e51 ("xhci: Fix front USB ports on ASUS PRIME B350M-A")
> 21939f003ad0 ("usb: host: xhci-plat: enable BROKEN_PED quirk if platform requested")
> 41135de1e7fd ("usb: xhci: add quirk flag for broken PED bits")
> 4750bc78efdb ("usb: host: xhci support option to disable the xHCI USB2 HW LPM")
> 488dc164914f ("xhci: remove WARN_ON if dma mask is not set for platform devices")
> 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration")
> 4efb2f694114 ("usb: host: xhci-plat: add struct xhci_plat_priv")
> 69307ccb9ad7 ("usb: xhci: bInterval quirk for TI TUSB73x0")
> 76f9502fe761 ("xhci: plat: adapt to unified device property interface")
> 9da5a1092b13 ("xhci: Bad Ethernet performance plugged in ASM1042A host")
> a3aef3793071 ("xhci: get rid of platform data")
> a7d57abcc8a5 ("xhci: workaround CSS timeout on AMD SNPS 3.0 xHC")
> dec08194ffec ("xhci: Limit USB2 port wake support for AMD Promontory hosts")
> def4e6f7b419 ("xhci: refactor and cleanup endpoint initialization.")
>
>
> NOTE: The patch will not be queued to stable trees until it is upstream.
Where do I send backport for v4.4 and v4.9?
Kai-Heng
>
> How should we proceed with this patch?
>
> --
> Thanks,
> Sasha
When filtering the xattr list for reading, the presence of a trusted xattr
results in a security audit log entry. However, if there is other content
no errno will be set, and if there isn't, the errno will be -ENODATA
rather than the -EPERM usually associated with a lack of capability.
The check does not block the request to list the xattrs that are present.
Switch to has_capability_noaudit to reflect the more appropriate check.
Signed-off-by: Mark Salyzyn <salyzyn(a)android.com>
Cc: linux-security-module(a)vger.kernel.org
Cc: kernel-team(a)android.com
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: stable(a)vger.kernel.org # v3.18
Fixes: upstream a082c6f680da ("ovl: filter trusted xattr for non-admin")
Fixes: 3.18 4bcc9b4b3a0a ("ovl: filter trusted xattr for non-admin")
---
Replaced ns_capable_noaudit() with the 3.18.y-tree-specific
has_capability_noaudit(), as present in the original submission to
kernel.org, commit 5c2e9f346b815841f9bed6029ebcb06415caf640
("ovl: filter of trusted xattr results in audit")
fs/overlayfs/inode.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a01ec1836a72..1175efa5e956 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -265,7 +265,8 @@ static bool ovl_can_list(const char *s)
return true;
/* Never list trusted.overlay, list other trusted for superuser only */
- return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
+ return !ovl_is_private_xattr(s) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
--
2.23.0.581.g78d2f28ef7-goog
This is the start of the stable review cycle for the 4.14.148 release.
There are 68 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Tue 08 Oct 2019 05:07:10 PM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.148-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.148-rc1
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
kexec: bail out upon SIGKILL when allocating memory.
Andrey Konovalov <andreyknvl(a)google.com>
NFC: fix attrs checks in netlink interface
Eric Biggers <ebiggers(a)google.com>
smack: use GFP_NOFS while holding inode_smack::smk_lock
Jann Horn <jannh(a)google.com>
Smack: Don't ignore other bprm->unsafe flags if LSM_UNSAFE_PTRACE is set
David Ahern <dsahern(a)gmail.com>
ipv6: Handle missing host route in __ipv6_ifa_notify
Eric Dumazet <edumazet(a)google.com>
sch_cbq: validate TCA_CBQ_WRROPT to avoid crash
Tuong Lien <tuong.t.lien(a)dektech.com.au>
tipc: fix unlimited bundling of small messages
Dongli Zhang <dongli.zhang(a)oracle.com>
xen-netfront: do not use ~0U as error return value for xennet_fill_frags()
Dotan Barak <dotanb(a)dev.mellanox.co.il>
net/rds: Fix error handling in rds_ib_add_one()
Dexuan Cui <decui(a)microsoft.com>
vsock: Fix a lockdep warning in __vsock_release()
Eric Dumazet <edumazet(a)google.com>
sch_dsmark: fix potential NULL deref in dsmark_init()
Reinhard Speyerer <rspmn(a)arcor.de>
qmi_wwan: add support for Cinterion CLS8 devices
Eric Dumazet <edumazet(a)google.com>
nfc: fix memory leak in llcp_sock_bind()
Martin KaFai Lau <kafai(a)fb.com>
net: Unpublish sk from sk_reuseport_cb before call_rcu
Navid Emamdoost <navid.emamdoost(a)gmail.com>
net: qlogic: Fix memory leak in ql_alloc_large_buffers
Paolo Abeni <pabeni(a)redhat.com>
net: ipv4: avoid mixed n_redirects and rate_tokens usage
Eric Dumazet <edumazet(a)google.com>
ipv6: drop incoming packets having a v4mapped source address
Johan Hovold <johan(a)kernel.org>
hso: fix NULL-deref on tty open
Haishuang Yan <yanhaishuang(a)cmss.chinamobile.com>
erspan: remove the incorrect mtu limit for erspan
Vishal Kulkarni <vishal(a)chelsio.com>
cxgb4:Fix out-of-bounds MSI-X info array access
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: fix use after free in prog symbol exposure
Nicolas Boichat <drinkcat(a)chromium.org>
kmemleak: increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE default to 16K
Changwei Ge <gechangwei(a)live.cn>
ocfs2: wait for recovering done after direct unlock request
Greg Thelen <gthelen(a)google.com>
kbuild: clean compressed initramfs image
David Howells <dhowells(a)redhat.com>
hypfs: Fix error number left in struct pointer member
Jens Axboe <axboe(a)kernel.dk>
pktcdvd: remove warning on attempting to register non-passthrough dev
OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
fat: work around race with userspace's read via blockdev while mounting
Mike Rapoport <mike.rapoport(a)gmail.com>
ARM: 8903/1: ensure that usable memory in bank 0 starts from a PMD-aligned address
Jia-Ju Bai <baijiaju1990(a)gmail.com>
security: smack: Fix possible null-pointer dereferences in smack_socket_sock_rcv_skb()
Thierry Reding <treding(a)nvidia.com>
PCI: exynos: Propagate errors for optional PHYs
Thierry Reding <treding(a)nvidia.com>
PCI: imx6: Propagate errors for optional regulators
Thierry Reding <treding(a)nvidia.com>
PCI: rockchip: Propagate errors for optional regulators
Joao Moreno <mail(a)joaomoreno.com>
HID: apple: Fix stuck function keys when using FN
Anson Huang <Anson.Huang(a)nxp.com>
rtc: snvs: fix possible race condition
Will Deacon <will(a)kernel.org>
ARM: 8898/1: mm: Don't treat faults reported from cache maintenance as writes
Miroslav Benes <mbenes(a)suse.cz>
livepatch: Nullify obj->mod in klp_module_coming()'s error path
Nishka Dasgupta <nishkadg.linux(a)gmail.com>
PCI: tegra: Fix OF node reference leak
Kai-Heng Feng <kai.heng.feng(a)canonical.com>
mfd: intel-lpss: Remove D3cold delay
Hans de Goede <hdegoede(a)redhat.com>
i2c-cht-wc: Fix lockdep warning
Nathan Chancellor <natechancellor(a)gmail.com>
MIPS: tlbex: Explicitly cast _PAGE_NO_EXEC to a boolean
Chris Wilson <chris(a)chris-wilson.co.uk>
dma-buf/sw_sync: Synchronize signal vs syncpt free
Bart Van Assche <bvanassche(a)acm.org>
scsi: core: Reduce memory required for SCSI logging
Eugen Hristev <eugen.hristev(a)microchip.com>
clk: at91: select parent if main oscillator or bypass is enabled
Arnd Bergmann <arnd(a)arndb.de>
arm64: fix unreachable code issue with cmpxchg
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/pseries: correctly track irq state in default idle
Nicholas Piggin <npiggin(a)gmail.com>
powerpc/64s/exception: machine check use correct cfar for late handler
Jean Delvare <jdelvare(a)suse.de>
drm/amdgpu/si: fix ASIC tests
Mark Menzynski <mmenzyns(a)redhat.com>
drm/nouveau/volt: Fix for some cards having 0 maximum voltage
hexin <hexin.op(a)gmail.com>
vfio_pci: Restore original state on release
Sowjanya Komatineni <skomatineni(a)nvidia.com>
pinctrl: tegra: Fix write barrier placement in pmx_writel
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/pseries/mobility: use cond_resched when updating device tree
Christophe Leroy <christophe.leroy(a)c-s.fr>
powerpc/futex: Fix warning: 'oldval' may be used uninitialized in this function
Nathan Lynch <nathanl(a)linux.ibm.com>
powerpc/rtas: use device model APIs and serialization during LPM
Cédric Le Goater <clg(a)kaod.org>
powerpc/xmon: Check for HV mode when dumping XIVE info from OPAL
Stephen Boyd <sboyd(a)kernel.org>
clk: zx296718: Don't reference clk_init_data after registration
Stephen Boyd <sboyd(a)kernel.org>
clk: sirf: Don't reference clk_init_data after registration
Icenowy Zheng <icenowy(a)aosc.io>
clk: sunxi-ng: v3s: add missing clock slices for MMC2 module clocks
Nathan Huckleberry <nhuck(a)google.com>
clk: qoriq: Fix -Wunused-const-variable
Corey Minyard <cminyard(a)mvista.com>
ipmi_si: Only schedule continuously in the thread in maintenance mode
Jia-Ju Bai <baijiaju1990(a)gmail.com>
gpu: drm: radeon: Fix a possible null-pointer dereference in radeon_connector_set_property()
KyleMahlkuch <kmahlkuc(a)linux.vnet.ibm.com>
drm/radeon: Fix EEH during kexec
Ahmad Fatoum <a.fatoum(a)pengutronix.de>
drm/stm: attach gem fence to atomic state
Marko Kohtala <marko.kohtala(a)okoko.fi>
video: ssd1307fb: Start page range at page_offset
Lucas Stach <l.stach(a)pengutronix.de>
drm/panel: simple: fix AUO g185han01 horizontal blanking
Andrey Smirnov <andrew.smirnov(a)gmail.com>
drm/bridge: tc358767: Increase AUX transfer length limit
Vadim Sukhomlinov <sukhomlinov(a)google.com>
tpm: Fix TPM 1.2 Shutdown sequence to prevent future TPM operations
Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
tpm: use tpm_try_get_ops() in tpm-sysfs.c.
Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
tpm: migrate pubek_show to struct tpm_buf
-------------
Diffstat:
Makefile | 4 +-
arch/arm/mm/fault.c | 4 +-
arch/arm/mm/fault.h | 1 +
arch/arm/mm/mmu.c | 16 ++
arch/arm64/include/asm/cmpxchg.h | 6 +-
arch/mips/mm/tlbex.c | 2 +-
arch/powerpc/include/asm/futex.h | 3 +-
arch/powerpc/kernel/exceptions-64s.S | 4 +
arch/powerpc/kernel/rtas.c | 11 +-
arch/powerpc/platforms/pseries/mobility.c | 9 ++
arch/powerpc/platforms/pseries/setup.c | 3 +
arch/powerpc/xmon/xmon.c | 15 +-
arch/s390/hypfs/inode.c | 9 +-
drivers/block/pktcdvd.c | 1 -
drivers/char/ipmi/ipmi_si_intf.c | 24 ++-
drivers/char/tpm/tpm-chip.c | 5 +-
drivers/char/tpm/tpm-sysfs.c | 201 ++++++++++++++----------
drivers/char/tpm/tpm.h | 13 --
drivers/clk/at91/clk-main.c | 10 +-
drivers/clk/clk-qoriq.c | 2 +-
drivers/clk/sirf/clk-common.c | 12 +-
drivers/clk/sunxi-ng/ccu-sun8i-v3s.c | 3 +
drivers/clk/zte/clk-zx296718.c | 109 ++++++-------
drivers/dma-buf/sw_sync.c | 16 +-
drivers/gpu/drm/amd/amdgpu/si.c | 6 +-
drivers/gpu/drm/bridge/tc358767.c | 2 +-
drivers/gpu/drm/nouveau/nvkm/subdev/bios/volt.c | 2 +
drivers/gpu/drm/panel/panel-simple.c | 6 +-
drivers/gpu/drm/radeon/radeon_connectors.c | 2 +-
drivers/gpu/drm/radeon/radeon_drv.c | 8 +
drivers/gpu/drm/stm/ltdc.c | 2 +
drivers/hid/hid-apple.c | 49 +++---
drivers/i2c/busses/i2c-cht-wc.c | 46 ++++++
drivers/mfd/intel-lpss-pci.c | 2 +
drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 9 +-
drivers/net/ethernet/qlogic/qla3xxx.c | 1 +
drivers/net/usb/hso.c | 12 +-
drivers/net/usb/qmi_wwan.c | 1 +
drivers/net/xen-netfront.c | 17 +-
drivers/pci/dwc/pci-exynos.c | 2 +-
drivers/pci/dwc/pci-imx6.c | 4 +-
drivers/pci/host/pci-tegra.c | 22 ++-
drivers/pci/host/pcie-rockchip.c | 16 +-
drivers/pinctrl/tegra/pinctrl-tegra.c | 4 +-
drivers/rtc/rtc-snvs.c | 11 +-
drivers/scsi/scsi_logging.c | 48 +-----
drivers/vfio/pci/vfio_pci.c | 17 +-
drivers/video/fbdev/ssd1307fb.c | 2 +-
fs/fat/dir.c | 13 +-
fs/fat/fatent.c | 3 +
fs/ocfs2/dlm/dlmunlock.c | 23 ++-
include/scsi/scsi_dbg.h | 2 -
kernel/bpf/syscall.c | 30 ++--
kernel/kexec_core.c | 2 +
kernel/livepatch/core.c | 1 +
lib/Kconfig.debug | 2 +-
net/core/sock.c | 11 +-
net/ipv4/ip_gre.c | 1 +
net/ipv4/route.c | 5 +-
net/ipv6/addrconf.c | 17 +-
net/ipv6/ip6_input.c | 10 ++
net/nfc/llcp_sock.c | 7 +-
net/nfc/netlink.c | 6 +-
net/rds/ib.c | 6 +-
net/sched/sch_cbq.c | 30 +++-
net/sched/sch_dsmark.c | 2 +
net/tipc/link.c | 30 ++--
net/tipc/msg.c | 5 +-
net/vmw_vsock/af_vsock.c | 16 +-
net/vmw_vsock/hyperv_transport.c | 2 +-
net/vmw_vsock/virtio_transport_common.c | 2 +-
security/smack/smack_access.c | 6 +-
security/smack/smack_lsm.c | 7 +-
usr/Makefile | 3 +
74 files changed, 626 insertions(+), 390 deletions(-)
The following commit has been merged into the perf/urgent branch of tip:
Commit-ID: b59711e9b0d22fd47abfa00602fd8c365cdd3ab7
Gitweb: https://git.kernel.org/tip/b59711e9b0d22fd47abfa00602fd8c365cdd3ab7
Author: Steve MacLean <Steve.MacLean(a)microsoft.com>
AuthorDate: Sat, 28 Sep 2019 01:41:18
Committer: Arnaldo Carvalho de Melo <acme(a)redhat.com>
CommitterDate: Mon, 30 Sep 2019 17:29:49 -03:00
perf inject jit: Fix JIT_CODE_MOVE filename
During perf inject --jit, JIT_CODE_MOVE records were injecting MMAP records
with an incorrect filename; specifically, it was missing the ".so" suffix.
Further, the JIT_CODE_LOAD records were silently truncating the
jr->load.code_index field to 32 bits before generating the filename.
Make both records emit the same filename, based on the full 64-bit
code_index field.
Fixes: 9b07e27f88b9 ("perf inject: Add jitdump mmap injection support")
Cc: stable(a)vger.kernel.org # v4.6+
Signed-off-by: Steve MacLean <Steve.MacLean(a)Microsoft.com>
Acked-by: Jiri Olsa <jolsa(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Brian Robbins <brianrob(a)microsoft.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Eric Saint-Etienne <eric.saint.etienne(a)oracle.com>
Cc: John Keeping <john(a)metanate.com>
Cc: John Salem <josalem(a)microsoft.com>
Cc: Leo Yan <leo.yan(a)linaro.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: Tom McDonald <thomas.mcdonald(a)microsoft.com>
Link: http://lore.kernel.org/lkml/BN8PR21MB1362FF8F127B31DBF4121528F7800@BN8PR21M…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/util/jitdump.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index 1bdf4c6..e3ccb0c 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -395,7 +395,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
size_t size;
u16 idr_size;
const char *sym;
- uint32_t count;
+ uint64_t count;
int ret, csize, usize;
pid_t pid, tid;
struct {
@@ -418,7 +418,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
return -1;
filename = event->mmap2.filename;
- size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so",
+ size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%" PRIu64 ".so",
jd->dir,
pid,
count);
@@ -529,7 +529,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
return -1;
filename = event->mmap2.filename;
- size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64,
+ size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%" PRIu64 ".so",
jd->dir,
pid,
jr->move.code_index);
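[editor's note] The truncation is the classic 32-bit narrowing plus format-string mismatch; a small standalone illustration in plain C (hypothetical values):

  #include <stdio.h>
  #include <stdint.h>
  #include <inttypes.h>

  int main(void)
  {
          uint64_t code_index = 0x100000002ULL;   /* hypothetical 64-bit index */
          char truncated[64], full[64];

          /* Old behaviour: the index was held in a uint32_t, losing the upper
           * 32 bits (and JIT_CODE_MOVE also dropped the ".so" suffix). */
          snprintf(truncated, sizeof(truncated), "jitted-%d-%u.so",
                   1234, (uint32_t)code_index);

          /* Fixed behaviour: keep the full 64-bit value, print it with PRIu64. */
          snprintf(full, sizeof(full), "jitted-%d-%" PRIu64 ".so",
                   1234, code_index);

          /* Prints jitted-1234-2.so vs jitted-1234-4294967298.so */
          printf("%s\n%s\n", truncated, full);
          return 0;
  }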
The following commit has been merged into the efi/urgent branch of tip:
Commit-ID: c05f8f92b701576b615f30aac31fabdc0648649b
Gitweb: https://git.kernel.org/tip/c05f8f92b701576b615f30aac31fabdc0648649b
Author: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
AuthorDate: Wed, 02 Oct 2019 18:58:59 +02:00
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Mon, 07 Oct 2019 15:24:35 +02:00
efivar/ssdt: Don't iterate over EFI vars if no SSDT override was specified
The kernel command line option efivar_ssdt= allows specifying the name
of an EFI variable that contains an ACPI SSDT table which should be
loaded into memory by the OS and treated as if it had been provided by
the firmware.
Currently, that code will always iterate over the EFI variables and
compare each name with the provided name, even if the command line
option wasn't set to begin with.
So bail early when no variable name was provided. This works around a
boot regression on the 2012 Mac Pro, as reported by Scott.
Tested-by: Scott Talbert <swt(a)techie.net>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Cc: <stable(a)vger.kernel.org> # v4.9+
Cc: Ben Dooks <ben.dooks(a)codethink.co.uk>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Cc: Jerry Snitselaar <jsnitsel(a)redhat.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Garrett <mjg59(a)google.com>
Cc: Octavian Purdila <octavian.purdila(a)intel.com>
Cc: Peter Jones <pjones(a)redhat.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: linux-efi(a)vger.kernel.org
Cc: linux-integrity(a)vger.kernel.org
Fixes: 475fb4e8b2f4 ("efi / ACPI: load SSTDs from EFI variables")
Link: https://lkml.kernel.org/r/20191002165904.8819-3-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
---
drivers/firmware/efi/efi.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 8d3e778..69f00f7 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -267,6 +267,9 @@ static __init int efivar_ssdt_load(void)
void *data;
int ret;
+ if (!efivar_ssdt[0])
+ return 0;
+
ret = efivar_init(efivar_ssdt_iter, &entries, true, &entries);
list_for_each_entry_safe(entry, aux, &entries, list) {
The following commit has been merged into the efi/urgent branch of tip:
Commit-ID: e658c82be5561412c5e83b5e74e9da4830593f3e
Gitweb: https://git.kernel.org/tip/e658c82be5561412c5e83b5e74e9da4830593f3e
Author: Jerry Snitselaar <jsnitsel(a)redhat.com>
AuthorDate: Wed, 02 Oct 2019 18:59:02 +02:00
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Mon, 07 Oct 2019 15:24:36 +02:00
efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log parsing
If __calc_tpm2_event_size() fails to parse an event it will return 0,
resulting in tpm2_calc_event_log_size() returning -1. Currently there is
no check of this return value, and 'efi_tpm_final_log_size' can end up
being set to this negative value, resulting in a crash like this one:
BUG: unable to handle page fault for address: ffffbc8fc00866ad
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
RIP: 0010:memcpy_erms+0x6/0x10
Call Trace:
tpm_read_log_efi()
tpm_bios_log_setup()
tpm_chip_register()
tpm_tis_core_init.cold.9+0x28c/0x466
tpm_tis_plat_probe()
platform_drv_probe()
...
Also, __calc_tpm2_event_size() returns a size of 0 when it fails to
parse an event, so update the function documentation to reflect this.
The root cause of the event-parsing failure in this case is resolved by
Peter Jones's patchset dealing with large event logs, where crossing a
page boundary causes the page holding the event count to be unmapped.
Signed-off-by: Jerry Snitselaar <jsnitsel(a)redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Cc: Ben Dooks <ben.dooks(a)codethink.co.uk>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Garrett <mjg59(a)google.com>
Cc: Octavian Purdila <octavian.purdila(a)intel.com>
Cc: Peter Jones <pjones(a)redhat.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Scott Talbert <swt(a)techie.net>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: linux-efi(a)vger.kernel.org
Cc: linux-integrity(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Fixes: c46f3405692de ("tpm: Reserve the TPM final events table")
Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
---
drivers/firmware/efi/tpm.c | 9 ++++++++-
include/linux/tpm_eventlog.h | 2 +-
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index b9ae5c6..703469c 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -85,11 +85,18 @@ int __init efi_tpm_eventlog_init(void)
final_tbl->nr_events,
log_tbl->log);
}
+
+ if (tbl_size < 0) {
+ pr_err(FW_BUG "Failed to parse event in TPM Final Events Log\n");
+ goto out_calc;
+ }
+
memblock_reserve((unsigned long)final_tbl,
tbl_size + sizeof(*final_tbl));
- early_memunmap(final_tbl, sizeof(*final_tbl));
efi_tpm_final_log_size = tbl_size;
+out_calc:
+ early_memunmap(final_tbl, sizeof(*final_tbl));
out:
early_memunmap(log_tbl, sizeof(*log_tbl));
return ret;
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index b50cc3a..131ea1b 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -152,7 +152,7 @@ struct tcg_algorithm_info {
* total. Once we've done this we know the offset of the data length field,
* and can calculate the total size of the event.
*
- * Return: size of the event on success, <0 on failure
+ * Return: size of the event on success, 0 on failure
*/
static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
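[editor's note] The hazard the added tbl_size check avoids is the usual signed-to-unsigned conversion: once a negative size reaches a size_t consumer such as memcpy(), it wraps to a huge length (the memcpy_erms crash above). A tiny standalone illustration in plain C:

  #include <stdio.h>

  int main(void)
  {
          int tbl_size = -1;              /* failure value from the size calculation */
          size_t len = (size_t)tbl_size;  /* negative int where a size_t is expected */

          printf("%zu\n", len);           /* 18446744073709551615 on a 64-bit machine */
          return 0;
  }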
The following commit has been merged into the efi/urgent branch of tip:
Commit-ID: 05c8c1ff81ed2eb9bad7c27cf92e55c864c16df8
Gitweb: https://git.kernel.org/tip/05c8c1ff81ed2eb9bad7c27cf92e55c864c16df8
Author: Peter Jones <pjones(a)redhat.com>
AuthorDate: Wed, 02 Oct 2019 18:59:01 +02:00
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Mon, 07 Oct 2019 15:24:35 +02:00
efi/tpm: Don't traverse an event log with no events
When there are no entries to put into the final event log, some machines
will return the template they would have populated anyway. In this case
the nr_events field is 0, but the rest of the log is just garbage.
This patch stops us from trying to iterate the table with
__calc_tpm2_event_size() when the number of events in the table is 0.
Tested-by: Lyude Paul <lyude(a)redhat.com>
Signed-off-by: Peter Jones <pjones(a)redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Acked-by: Matthew Garrett <mjg59(a)google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Cc: Ben Dooks <ben.dooks(a)codethink.co.uk>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Jerry Snitselaar <jsnitsel(a)redhat.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Octavian Purdila <octavian.purdila(a)intel.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Scott Talbert <swt(a)techie.net>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: linux-efi(a)vger.kernel.org
Cc: linux-integrity(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Fixes: c46f3405692d ("tpm: Reserve the TPM final events table")
Link: https://lkml.kernel.org/r/20191002165904.8819-5-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
---
drivers/firmware/efi/tpm.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index 1d3f5ca..b9ae5c6 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -75,11 +75,16 @@ int __init efi_tpm_eventlog_init(void)
goto out;
}
- tbl_size = tpm2_calc_event_log_size((void *)efi.tpm_final_log
- + sizeof(final_tbl->version)
- + sizeof(final_tbl->nr_events),
- final_tbl->nr_events,
- log_tbl->log);
+ tbl_size = 0;
+ if (final_tbl->nr_events != 0) {
+ void *events = (void *)efi.tpm_final_log
+ + sizeof(final_tbl->version)
+ + sizeof(final_tbl->nr_events);
+
+ tbl_size = tpm2_calc_event_log_size(events,
+ final_tbl->nr_events,
+ log_tbl->log);
+ }
memblock_reserve((unsigned long)final_tbl,
tbl_size + sizeof(*final_tbl));
early_memunmap(final_tbl, sizeof(*final_tbl));
The following commit has been merged into the efi/urgent branch of tip:
Commit-ID: 047d50aee341d940350897c85799e56ae57c3849
Gitweb: https://git.kernel.org/tip/047d50aee341d940350897c85799e56ae57c3849
Author: Peter Jones <pjones(a)redhat.com>
AuthorDate: Wed, 02 Oct 2019 18:59:00 +02:00
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Mon, 07 Oct 2019 15:24:35 +02:00
efi/tpm: Don't access event->count when it isn't mapped
Some machines generate a lot of event log entries. When we're
iterating over them, the code removes the old mapping and adds a
new one, so once we cross the page boundary we're unmapping the page
with the count on it. Hilarity ensues.
This patch keeps the info from the header in local variables so we don't
need to access that page again or keep track of whether it's mapped.
Tested-by: Lyude Paul <lyude(a)redhat.com>
Signed-off-by: Peter Jones <pjones(a)redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
Acked-by: Matthew Garrett <mjg59(a)google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Cc: Ben Dooks <ben.dooks(a)codethink.co.uk>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Jerry Snitselaar <jsnitsel(a)redhat.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Octavian Purdila <octavian.purdila(a)intel.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Scott Talbert <swt(a)techie.net>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: linux-efi(a)vger.kernel.org
Cc: linux-integrity(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations")
Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
---
include/linux/tpm_eventlog.h | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 63238c8..b50cc3a 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
u16 halg;
int i;
int j;
+ u32 count, event_type;
marker = event;
marker_start = marker;
@@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
}
event = (struct tcg_pcr_event2_head *)mapping;
+ /*
+ * The loop below will unmap these fields if the log is larger than
+ * one page, so save them here for reference:
+ */
+ count = READ_ONCE(event->count);
+ event_type = READ_ONCE(event->event_type);
efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
/* Check if event is malformed. */
- if (event->count > efispecid->num_algs) {
+ if (count > efispecid->num_algs) {
size = 0;
goto out;
}
- for (i = 0; i < event->count; i++) {
+ for (i = 0; i < count; i++) {
halg_size = sizeof(event->digests[i].alg_id);
/* Map the digest's algorithm identifier */
@@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+ event_field->event_size;
size = marker - marker_start;
- if ((event->event_type == 0) && (event_field->event_size == 0))
+ if (event_type == 0 && event_field->event_size == 0)
size = 0;
+
out:
if (do_mapping)
TPM_MEMUNMAP(mapping, mapping_size);
This is a note to let you know that I've just added the patch titled
staging: vt6655: Fix memory leak in vt6655_probe
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 80b15db5e1e9c3300de299b2d43d1aafb593e6ac Mon Sep 17 00:00:00 2001
From: Navid Emamdoost <navid.emamdoost(a)gmail.com>
Date: Fri, 4 Oct 2019 15:03:15 -0500
Subject: staging: vt6655: Fix memory leak in vt6655_probe
In vt6655_probe, if vnt_init() fails, the cleanup code needs to be called
as in the other error handling cases. Add the missing call to
device_free_info().
Fixes: 67013f2c0e58 ("staging: vt6655: mac80211 conversion add main mac80211 functions")
Signed-off-by: Navid Emamdoost <navid.emamdoost(a)gmail.com>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20191004200319.22394-1-navid.emamdoost@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/staging/vt6655/device_main.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index c6bb4aaf9bd0..082302944c37 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -1748,8 +1748,10 @@ vt6655_probe(struct pci_dev *pcid, const struct pci_device_id *ent)
priv->hw->max_signal = 100;
- if (vnt_init(priv))
+ if (vnt_init(priv)) {
+ device_free_info(priv);
return -ENODEV;
+ }
device_print_info(priv);
pci_set_drvdata(pcid, priv);
--
2.23.0
This is an automatic generated email to let you know that the following patch were queued:
Subject: media: rc: mark input device as pointing stick
Author: Sean Young <sean(a)mess.org>
Date: Sat Sep 28 17:46:14 2019 -0300
libinput refuses pointer movement from rc-core, since it believes it's not
a pointer-type device:
libinput error: event17 - Media Center Ed. eHome Infrared Remote Transceiver (1784:0008): libinput bug: REL_X/Y from a non-pointer device
Fixes: 158bc148a31e ("media: rc: mce_kbd: input events via rc-core's input device")
Fixes: 0ac5a603a732 ("media: rc: imon: report mouse events using rc-core's input device")
Cc: stable(a)vger.kernel.org # 4.20+
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
drivers/media/rc/rc-main.c | 1 +
1 file changed, 1 insertion(+)
---
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 13da4c5c7d17..7741151606ef 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -1773,6 +1773,7 @@ static int rc_prepare_rx_device(struct rc_dev *dev)
set_bit(MSC_SCAN, dev->input_dev->mscbit);
/* Pointer/mouse events */
+ set_bit(INPUT_PROP_POINTING_STICK, dev->input_dev->propbit);
set_bit(EV_REL, dev->input_dev->evbit);
set_bit(REL_X, dev->input_dev->relbit);
set_bit(REL_Y, dev->input_dev->relbit);
Calling 'panic()' on a kernel with CONFIG_PREEMPT=y can leave the
calling CPU in an infinite loop, but with interrupts and preemption
enabled. From this state, userspace can continue to be scheduled,
despite the system being "dead" as far as the kernel is concerned. This
is easily reproducible on arm64 when booting with "nosmp" on the command
line; a couple of shell scripts print out a periodic "Ping" message
whilst another triggers a crash by writing to /proc/sysrq-trigger:
| sysrq: Trigger a crash
| Kernel panic - not syncing: sysrq triggered crash
| CPU: 0 PID: 1 Comm: init Not tainted 5.2.15 #1
| Hardware name: linux,dummy-virt (DT)
| Call trace:
| dump_backtrace+0x0/0x148
| show_stack+0x14/0x20
| dump_stack+0xa0/0xc4
| panic+0x140/0x32c
| sysrq_handle_reboot+0x0/0x20
| __handle_sysrq+0x124/0x190
| write_sysrq_trigger+0x64/0x88
| proc_reg_write+0x60/0xa8
| __vfs_write+0x18/0x40
| vfs_write+0xa4/0x1b8
| ksys_write+0x64/0xf0
| __arm64_sys_write+0x14/0x20
| el0_svc_common.constprop.0+0xb0/0x168
| el0_svc_handler+0x28/0x78
| el0_svc+0x8/0xc
| Kernel Offset: disabled
| CPU features: 0x0002,24002004
| Memory Limit: none
| ---[ end Kernel panic - not syncing: sysrq triggered crash ]---
| Ping 2!
| Ping 1!
| Ping 1!
| Ping 2!
The issue can also be triggered on x86 kernels if CONFIG_SMP=n, otherwise
local interrupts are disabled in 'smp_send_stop()'.
Disable preemption in 'panic()' before re-enabling interrupts.
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/BX1W47JXPMR8.58IYW53H6M5N@dragonstone
Reported-by: Xogium <contact(a)xogium.me>
Signed-off-by: Will Deacon <will(a)kernel.org>
---
kernel/panic.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/panic.c b/kernel/panic.c
index 47e8ebccc22b..f470a038b05b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,6 +180,7 @@ void panic(const char *fmt, ...)
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
--
2.23.0.444.g18eeb5a265-goog
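To make the effect of the one-liner concrete, here is a simplified sketch (not the literal panic() body) of what the tail of panic() amounts to after the change: with both interrupts and preemption disabled, the panicking CPU can never be scheduled away from its final loop, so userspace cannot keep running behind the kernel's back.

static void __noreturn panic_dead_loop(void)    /* illustrative name only */
{
    local_irq_disable();         /* already done by panic() */
    preempt_disable_notrace();   /* the fix: no involuntary reschedule either */

    for (;;)
        cpu_relax();             /* spin forever on the "dead" CPU */
}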
From: Vitaly Wool <vitalywool(a)gmail.com>
Subject: mm/z3fold.c: claim page in the beginning of free
There's a really hard-to-reproduce race in z3fold between z3fold_free()
and z3fold_reclaim_page(). z3fold_reclaim_page() can claim the page after
z3fold_free() has checked whether the page was claimed, and z3fold_free()
will then schedule this page for compaction, which may in turn lead to
random page faults (since that page will have been reclaimed by then). Fix
that by claiming the page at the beginning of z3fold_free() and clearing
the claim at the end.
[vitalywool(a)gmail.com: v2]
Link: http://lkml.kernel.org/r/20190928113456.152742cf@bigdell
Link: http://lkml.kernel.org/r/20190926104844.4f0c6efa1366b8f5741eaba9@gmail.com
Signed-off-by: Vitaly Wool <vitalywool(a)gmail.com>
Reported-by: Markus Linnala <markus.linnala(a)gmail.com>
Cc: Dan Streetman <ddstreet(a)ieee.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Markus Linnala <markus.linnala(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/mm/z3fold.c~z3fold-claim-page-in-the-beginning-of-free
+++ a/mm/z3fold.c
@@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_po
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_po
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_po
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page has not been claimed by us */
z3fold_page_unlock(zhdr);
return;
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_po
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
_
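The core of the fix is using test_and_set_bit(PAGE_CLAIMED) as an ownership handshake between the free and reclaim paths. A minimal illustration of that pattern, using the same bit operations as the diff (try_claim()/release_claim() are hypothetical helpers, not z3fold functions):

/* Whoever wins the test_and_set_bit() owns the page; everyone else must
 * back off, and the owner clears the bit once it is done with the page. */
static bool try_claim(struct page *page)
{
    return !test_and_set_bit(PAGE_CLAIMED, &page->private);
}

static void release_claim(struct page *page)
{
    clear_bit(PAGE_CLAIMED, &page->private);
}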
From: Michal Hocko <mhocko(a)suse.com>
Subject: kernel/sysctl.c: do not override max_threads provided by userspace
Partially revert 16db3d3f1170 ("kernel/sysctl.c: threads-max observe
limits") because the patch is causing a regression for any workload which
needs to override the auto-tuning of the limit provided by the kernel.
set_max_threads() implements a boot-time guesstimate to provide a sensible
limit on the number of concurrently running threads so that runaways will
not deplete all the memory. This is a good thing in general, but there are
workloads which might need to increase this limit for an application to
run (reportedly WebSphere MQ is affected) and that is simply not possible
after the mentioned change. It is also very dubious to override an admin
decision by an estimation that doesn't have any direct relation to the
correctness of the kernel operation.
Fix this by dropping set_max_threads() from sysctl_max_threads() so any
value is accepted as long as it fits into MAX_THREADS, which is important
to check because allowing more threads could break the internal robust
futex restriction. While at it, do not use MIN_THREADS as the lower
boundary because it is also only a heuristic for automatic estimation, and
an admin might have a good reason to prevent new threads from being
created even below this limit.
This became more severe when we switched x86 from 4k to 8k kernel stacks.
Since 6538b8ea886e ("x86_64: expand kernel stack to 16K") (3.16) we use
THREAD_SIZE_ORDER = 2, and that halved the auto-tuned value.
In the particular case
3.12
kernel.threads-max = 515561
4.4
kernel.threads-max = 200000
Neither of the two values is really insane on a 32GB machine.
I am not sure we want/need to tune the max_threads value further. If
anything, the tuning should be removed altogether if proven not useful in
general. But we definitely need a way to override this auto-tuning.
Link: http://lkml.kernel.org/r/20190922065801.GB18814@dhcp22.suse.cz
Fixes: 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Heinrich Schuchardt <xypron.glpk(a)gmx.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/fork.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/fork.c~kernel-sysctlc-do-not-override-max_threads-provided-by-userspace
+++ a/kernel/fork.c
@@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table
struct ctl_table t;
int ret;
int threads = max_threads;
- int min = MIN_THREADS;
+ int min = 1;
int max = MAX_THREADS;
t = *table;
@@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table
if (ret || !write)
return ret;
- set_max_threads(threads);
+ max_threads = threads;
return 0;
}
_
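Put differently, after this change the sysctl handler only enforces the hard [1, MAX_THREADS] range and otherwise takes the admin's value verbatim, instead of re-running the boot-time heuristic. A hypothetical sketch of that behaviour (set_threads_limit() is an illustrative helper, not the real proc handler):

/* Hypothetical: accept any admin-supplied value within the hard limits. */
static int set_threads_limit(int requested)
{
    if (requested < 1 || requested > MAX_THREADS)
        return -EINVAL;

    max_threads = requested;    /* plain assignment, no set_max_threads() */
    return 0;
}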
From: Will Deacon <will(a)kernel.org>
Subject: panic: ensure preemption is disabled during panic()
Calling 'panic()' on a kernel with CONFIG_PREEMPT=y can leave the calling
CPU in an infinite loop, but with interrupts and preemption enabled. From
this state, userspace can continue to be scheduled, despite the system
being "dead" as far as the kernel is concerned. This is easily
reproducible on arm64 when booting with "nosmp" on the command line; a
couple of shell scripts print out a periodic "Ping" message whilst another
triggers a crash by writing to /proc/sysrq-trigger:
| sysrq: Trigger a crash
| Kernel panic - not syncing: sysrq triggered crash
| CPU: 0 PID: 1 Comm: init Not tainted 5.2.15 #1
| Hardware name: linux,dummy-virt (DT)
| Call trace:
| dump_backtrace+0x0/0x148
| show_stack+0x14/0x20
| dump_stack+0xa0/0xc4
| panic+0x140/0x32c
| sysrq_handle_reboot+0x0/0x20
| __handle_sysrq+0x124/0x190
| write_sysrq_trigger+0x64/0x88
| proc_reg_write+0x60/0xa8
| __vfs_write+0x18/0x40
| vfs_write+0xa4/0x1b8
| ksys_write+0x64/0xf0
| __arm64_sys_write+0x14/0x20
| el0_svc_common.constprop.0+0xb0/0x168
| el0_svc_handler+0x28/0x78
| el0_svc+0x8/0xc
| Kernel Offset: disabled
| CPU features: 0x0002,24002004
| Memory Limit: none
| ---[ end Kernel panic - not syncing: sysrq triggered crash ]---
| Ping 2!
| Ping 1!
| Ping 1!
| Ping 2!
The issue can also be triggered on x86 kernels if CONFIG_SMP=n, otherwise
local interrupts are disabled in 'smp_send_stop()'.
Disable preemption in 'panic()' before re-enabling interrupts.
Link: http://lkml.kernel.org/r/20191002123538.22609-1-will@kernel.org
Link: https://lore.kernel.org/r/BX1W47JXPMR8.58IYW53H6M5N@dragonstone
Signed-off-by: Will Deacon <will(a)kernel.org>
Reported-by: Xogium <contact(a)xogium.me>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Petr Mladek <pmladek(a)suse.com>
Cc: Feng Tang <feng.tang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/panic.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/panic.c~panic-ensure-preemption-is-disabled-during-panic
+++ a/kernel/panic.c
@@ -180,6 +180,7 @@ void panic(const char *fmt, ...)
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
_
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/209791
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
We grabbed the 3f4d541d6ee4 commit of the stable queue repository.
We then merged the patchset with `git am`:
drm-vkms-fix-crc-worker-races.patch
drm-mcde-fix-uninitialized-variable.patch
drm-bridge-tc358767-increase-aux-transfer-length-lim.patch
drm-vkms-avoid-assigning-0-for-possible_crtc.patch
drm-panel-simple-fix-auo-g185han01-horizontal-blanki.patch
drm-amd-display-add-monitor-patch-to-add-t7-delay.patch
drm-amd-display-power-gate-all-dscs-at-driver-init-t.patch
drm-amd-display-fix-not-calling-ppsmu-to-trigger-pme.patch
drm-amd-display-clear-fec_ready-shadow-register-if-d.patch
drm-amd-display-copy-gsl-groups-when-committing-a-ne.patch
video-ssd1307fb-start-page-range-at-page_offset.patch
drm-tinydrm-kconfig-drivers-select-backlight_class_d.patch
drm-stm-attach-gem-fence-to-atomic-state.patch
drm-bridge-sii902x-fix-missing-reference-to-mclk-clo.patch
drm-panel-check-failure-cases-in-the-probe-func.patch
drm-rockchip-check-for-fast-link-training-before-ena.patch
drm-amdgpu-fix-hard-hang-for-s-g-display-bos.patch
drm-amd-display-use-proper-enum-conversion-functions.patch
drm-radeon-fix-eeh-during-kexec.patch
gpu-drm-radeon-fix-a-possible-null-pointer-dereferen.patch
clk-imx8mq-mark-ahb-clock-as-critical.patch
pci-rpaphp-avoid-a-sometimes-uninitialized-warning.patch
pinctrl-stmfx-update-pinconf-settings.patch
ipmi_si-only-schedule-continuously-in-the-thread-in-.patch
clk-qoriq-fix-wunused-const-variable.patch
clk-ingenic-jz4740-fix-pll-half-divider-not-read-wri.patch
clk-sunxi-ng-v3s-add-missing-clock-slices-for-mmc2-m.patch
drm-amd-display-fix-issue-where-252-255-values-are-c.patch
drm-amd-display-fix-frames_to_insert-math.patch
drm-amd-display-reprogram-vm-config-when-system-resu.patch
drm-amd-display-register-vupdate_no_lock-interrupts-.patch
powerpc-powernv-ioda2-allocate-tce-table-levels-on-d.patch
clk-actions-don-t-reference-clk_init_data-after-regi.patch
clk-sirf-don-t-reference-clk_init_data-after-registr.patch
clk-meson-axg-audio-don-t-reference-clk_init_data-af.patch
clk-sprd-don-t-reference-clk_init_data-after-registr.patch
clk-zx296718-don-t-reference-clk_init_data-after-reg.patch
clk-sunxi-don-t-call-clk_hw_get_name-on-a-hw-that-is.patch
powerpc-xmon-check-for-hv-mode-when-dumping-xive-inf.patch
powerpc-rtas-use-device-model-apis-and-serialization.patch
powerpc-ptdump-fix-walk_pagetables-address-mismatch.patch
powerpc-futex-fix-warning-oldval-may-be-used-uniniti.patch
powerpc-64s-radix-fix-memory-hotplug-section-page-ta.patch
powerpc-pseries-mobility-use-cond_resched-when-updat.patch
powerpc-perf-fix-imc-allocation-failure-handling.patch
pinctrl-tegra-fix-write-barrier-placement-in-pmx_wri.patch
powerpc-eeh-clear-stale-eeh_dev_no_handler-flag.patch
vfio_pci-restore-original-state-on-release.patch
drm-amdgpu-sdma5-fix-number-of-sdma5-trap-irq-types-.patch
drm-nouveau-kms-tu102-disable-input-lut-when-input-i.patch
drm-nouveau-volt-fix-for-some-cards-having-0-maximum.patch
pinctrl-amd-disable-spurious-firing-gpio-irqs.patch
clk-renesas-mstp-set-genpd_flag_always_on-for-clock-.patch
clk-renesas-cpg-mssr-set-genpd_flag_always_on-for-cl.patch
drm-amd-display-support-spdif.patch
drm-amd-powerpaly-fix-navi-series-custom-peak-level-.patch
drm-amd-display-fix-mpo-hubp-underflow-with-scatter-.patch
drm-amd-display-fix-trigger-not-generated-for-freesy.patch
selftests-powerpc-retry-on-host-facility-unavailable.patch
kbuild-do-not-enable-wimplicit-fallthrough-for-clang.patch
drm-amdgpu-si-fix-asic-tests.patch
powerpc-64s-exception-machine-check-use-correct-cfar.patch
pstore-fs-superblock-limits.patch
powerpc-eeh-clean-up-eeh-pes-after-recovery-finishes.patch
clk-qcom-gcc-sdm845-use-floor-ops-for-sdcc-clks.patch
powerpc-pseries-correctly-track-irq-state-in-default.patch
pinctrl-meson-gxbb-fix-wrong-pinning-definition-for-.patch
mailbox-mediatek-cmdq-clear-the-event-in-cmdq-initia.patch
arm-dts-dir685-drop-spi-cpol-from-the-display.patch
arm64-fix-unreachable-code-issue-with-cmpxchg.patch
clk-at91-select-parent-if-main-oscillator-or-bypass-.patch
clk-imx-pll14xx-avoid-glitch-when-set-rate.patch
clk-imx-clk-pll14xx-unbypass-pll-by-default.patch
clk-make-clk_bulk_get_all-return-a-valid-id.patch
powerpc-dump-kernel-log-before-carrying-out-fadump-o.patch
mbox-qcom-add-apcs-child-device-for-qcs404.patch
clk-sprd-add-missing-kfree.patch
scsi-core-reduce-memory-required-for-scsi-logging.patch
dma-buf-sw_sync-synchronize-signal-vs-syncpt-free.patch
f2fs-fix-to-drop-meta-node-pages-during-umount.patch
ext4-fix-potential-use-after-free-after-remounting-w.patch
mips-ingenic-disable-broken-btb-lookup-optimization.patch
mips-don-t-use-bc_false-uninitialized-in-__mm_isbran.patch
mips-tlbex-explicitly-cast-_page_no_exec-to-a-boolea.patch
i2c-cht-wc-fix-lockdep-warning.patch
mfd-intel-lpss-remove-d3cold-delay.patch
pci-tegra-fix-of-node-reference-leak.patch
hid-wacom-fix-several-minor-compiler-warnings.patch
rtc-bd70528-fix-driver-dependencies.patch
mips-atomic-fix-loongson_llsc_mb-wreckage.patch
pci-pci-hyperv-fix-build-errors-on-non-sysfs-config.patch
pci-layerscape-add-the-bar_fixed_64bit-property-to-t.patch
livepatch-nullify-obj-mod-in-klp_module_coming-s-err.patch
mips-atomic-fix-smp_mb__-before-after-_atomic.patch
arm-8898-1-mm-don-t-treat-faults-reported-from-cache.patch
soundwire-intel-fix-channel-number-reported-by-hardw.patch
pci-mobiveil-fix-the-cpu-base-address-setup-in-inbou.patch
arm-8875-1-kconfig-default-to-aeabi-w-clang.patch
rtc-snvs-fix-possible-race-condition.patch
rtc-pcf85363-pcf85263-fix-regmap-error-in-set_time.patch
power-supply-register-hwmon-devices-with-valid-names.patch
selinux-fix-residual-uses-of-current_security-for-th.patch
pci-add-pci_info_ratelimited-to-ratelimit-pci-separa.patch
hid-apple-fix-stuck-function-keys-when-using-fn.patch
pci-rockchip-propagate-errors-for-optional-regulator.patch
pci-histb-propagate-errors-for-optional-regulators.patch
pci-imx6-propagate-errors-for-optional-regulators.patch
pci-exynos-propagate-errors-for-optional-phys.patch
security-smack-fix-possible-null-pointer-dereference.patch
pci-use-static-const-struct-not-const-static-struct.patch
arm-8905-1-emit-__gnu_mcount_nc-when-using-clang-10..patch
arm-8903-1-ensure-that-usable-memory-in-bank-0-start.patch
i2c-tegra-move-suspend-handling-to-noirq-phase.patch
block-bfq-push-up-injection-only-after-setting-servi.patch
fat-work-around-race-with-userspace-s-read-via-block.patch
pktcdvd-remove-warning-on-attempting-to-register-non.patch
hypfs-fix-error-number-left-in-struct-pointer-member.patch
tools-power-x86-intel-speed-select-fix-high-priority.patch
crypto-hisilicon-fix-double-free-in-sec_free_hw_sgl.patch
mm-add-dummy-can_do_mlock-helper.patch
kbuild-clean-compressed-initramfs-image.patch
ocfs2-wait-for-recovering-done-after-direct-unlock-r.patch
kmemleak-increase-debug_kmemleak_early_log_size-defa.patch
arm64-consider-stack-randomization-for-mmap-base-onl.patch
mips-properly-account-for-stack-randomization-and-st.patch
arm-properly-account-for-stack-randomization-and-sta.patch
arm-use-stack_top-when-computing-mmap-base-address.patch
cxgb4-fix-out-of-bounds-msi-x-info-array-access.patch
erspan-remove-the-incorrect-mtu-limit-for-erspan.patch
hso-fix-null-deref-on-tty-open.patch
ipv6-drop-incoming-packets-having-a-v4mapped-source-address.patch
ipv6-handle-missing-host-route-in-__ipv6_ifa_notify.patch
net-ipv4-avoid-mixed-n_redirects-and-rate_tokens-usage.patch
net-qlogic-fix-memory-leak-in-ql_alloc_large_buffers.patch
net-sched-taprio-fix-potential-integer-overflow-in-taprio_set_picos_per_byte.patch
net-unpublish-sk-from-sk_reuseport_cb-before-call_rcu.patch
nfc-fix-memory-leak-in-llcp_sock_bind.patch
qmi_wwan-add-support-for-cinterion-cls8-devices.patch
rxrpc-fix-rxrpc_recvmsg-tracepoint.patch
sch_cbq-validate-tca_cbq_wrropt-to-avoid-crash.patch
sch_dsmark-fix-potential-null-deref-in-dsmark_init.patch
tipc-fix-unlimited-bundling-of-small-messages.patch
udp-fix-gso_segs-calculations.patch
vsock-fix-a-lockdep-warning-in-__vsock_release.patch
net-dsa-rtl8366-check-vlan-id-and-not-ports.patch
tcp-adjust-rto_base-in-retransmits_timed_out.patch
udp-only-do-gso-if-of-segs-1.patch
net-rds-fix-error-handling-in-rds_ib_add_one.patch
net-dsa-sja1105-initialize-the-meta_lock.patch
xen-netfront-do-not-use-0u-as-error-return-value-for-xennet_fill_frags.patch
net-dsa-sja1105-fix-sleeping-while-atomic-in-.port_hwtstamp_set.patch
ptp_qoriq-initialize-the-registers-spinlock-before-calling-ptp_qoriq_settime.patch
net-dsa-sja1105-ensure-ptp-time-for-rxtstamp-reconstruction-is-not-in-the-past.patch
net-dsa-sja1105-prevent-leaking-memory.patch
net-socionext-netsec-always-grab-descriptor-lock.patch
net-sched-cbs-avoid-division-by-zero-when-calculating-the-port-rate.patch
net-sched-taprio-avoid-division-by-zero-on-invalid-link-speed.patch
smack-don-t-ignore-other-bprm-unsafe-flags-if-lsm_unsafe_ptrace-is-set.patch
smack-use-gfp_nofs-while-holding-inode_smack-smk_lock.patch
dm-raid-fix-updating-of-max_discard_sectors-limit.patch
dm-zoned-fix-invalid-memory-access.patch
Compile testing
---------------
We compiled the kernel for 3 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
🚧 ⚡⚡⚡ LTP lite
🚧 ⚡⚡⚡ POSIX pjd-fstest suites
🚧 ⚡⚡⚡ Networking route: pmtu
🚧 ⚡⚡⚡ storage: dm/common
🚧 ⚡⚡⚡ Networking route_func: local
🚧 ⚡⚡⚡ Networking route_func: forward
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ Networking route: pmtu
🚧 ✅ storage: dm/common
🚧 ✅ Networking route_func: local
🚧 ✅ Networking route_func: forward
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
x86_64:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ pciutils: sanity smoke test
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
✅ stress: stress-ng
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ Networking route: pmtu
🚧 ✅ storage: dm/common
🚧 ✅ Networking route_func: local
🚧 ✅ Networking route_func: forward
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
Host 3:
✅ Boot test
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
Test sources: https://github.com/CKI-project/tests-beaker
💚 Pull requests are welcome for new tests or improvements to existing tests!
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: ed56826f1779 - Linux 5.3.4
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: OK
Tests: FAILED
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/207647
One or more kernel tests failed:
x86_64:
❌ stress: stress-ng
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 3 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ xfstests: ext4
✅ xfstests: xfs
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking sctp-auth: sockopts test
✅ Networking: igmp conformance test
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ Networking tunnel: vxlan basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
🚧 ⚡⚡⚡ LTP lite
🚧 ⚡⚡⚡ CIFS Connectathon
🚧 ⚡⚡⚡ POSIX pjd-fstest suites
🚧 ⚡⚡⚡ Memory function: kaslr
🚧 ⚡⚡⚡ Networking bridge: sanity
🚧 ⚡⚡⚡ Networking MACsec: sanity
🚧 ⚡⚡⚡ Networking route: pmtu
🚧 ⚡⚡⚡ Networking tunnel: geneve basic test
🚧 ⚡⚡⚡ L2TP basic test
🚧 ⚡⚡⚡ Networking vnic: ipvlan/basic
🚧 ⚡⚡⚡ ALSA PCM loopback test
🚧 ⚡⚡⚡ ALSA Control (mixer) Userspace Element test
🚧 ⚡⚡⚡ storage: dm/common
🚧 ⚡⚡⚡ trace: ftrace/tracer
🚧 ⚡⚡⚡ Networking route_func: local
🚧 ⚡⚡⚡ Networking route_func: forward
🚧 ⚡⚡⚡ Networking ipsec: basic netns transport
🚧 ⚡⚡⚡ Networking ipsec: basic netns tunnel
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking sctp-auth: sockopts test
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ Networking tunnel: vxlan basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
🚧 ✅ LTP lite
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking bridge: sanity
🚧 ✅ Networking MACsec: sanity
🚧 ✅ Networking route: pmtu
🚧 ✅ Networking tunnel: geneve basic test
🚧 ✅ L2TP basic test
🚧 ✅ Networking ipsec: basic netns tunnel
🚧 ✅ Networking vnic: ipvlan/basic
🚧 ✅ ALSA PCM loopback test
🚧 ✅ ALSA Control (mixer) Userspace Element test
🚧 ✅ storage: dm/common
🚧 ✅ trace: ftrace/tracer
🚧 ✅ Networking route_func: local
🚧 ✅ Networking route_func: forward
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ xfstests: xfs
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
x86_64:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking sctp-auth: sockopts test
✅ Networking: igmp conformance test
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: gre basic
✅ Networking tunnel: vxlan basic
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ pciutils: sanity smoke test
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
❌ stress: stress-ng
🚧 ❌ LTP lite
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking bridge: sanity
🚧 ✅ Networking MACsec: sanity
🚧 ✅ Networking route: pmtu
🚧 ✅ Networking tunnel: geneve basic test
🚧 ✅ L2TP basic test
🚧 ✅ Networking vnic: ipvlan/basic
🚧 ✅ ALSA PCM loopback test
🚧 ✅ ALSA Control (mixer) Userspace Element test
🚧 ✅ storage: dm/common
🚧 ✅ trace: ftrace/tracer
🚧 ✅ Networking route_func: local
🚧 ✅ Networking route_func: forward
🚧 ✅ Networking ipsec: basic netns transport
🚧 ✅ Networking ipsec: basic netns tunnel
Host 2:
✅ Boot test
✅ Storage SAN device stress - mpt3sas driver
Host 3:
✅ Boot test
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
Host 4:
✅ Boot test
✅ Storage SAN device stress - megaraid_sas
Host 5:
✅ Boot test
✅ xfstests: ext4
✅ xfstests: xfs
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ IOMMU boot test
🚧 ✅ Storage blktests
Test sources: https://github.com/CKI-project/tests-beaker
💚 Pull requests are welcome for new tests or improvements to existing tests!
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1ea32c83c699df32689d329b2415796b7bfc2f6e Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb(a)linux.ibm.com>
Date: Thu, 29 Aug 2019 20:09:06 -0400
Subject: [PATCH] tpm_tis_core: Set TPM_CHIP_FLAG_IRQ before probing for
interrupts
The tpm_tis_core has to set the TPM_CHIP_FLAG_IRQ before probing for
interrupts since there is no other place in the code that would set
it.
Cc: linux-stable(a)vger.kernel.org
Fixes: 570a36097f30 ("tpm: drop 'irq' from struct tpm_vendor_specific")
Signed-off-by: Stefan Berger <stefanb(a)linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index ffa9048d8f6c..270f43acbb77 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -981,6 +981,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
}
tpm_chip_start(chip);
+ chip->flags |= TPM_CHIP_FLAG_IRQ;
if (irq) {
tpm_tis_probe_irq_single(chip, intmask, IRQF_SHARED,
irq);
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/209191
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
We grabbed the cb49cbf5789a commit of the stable queue repository.
We then merged the patchset with `git am`:
drm-vkms-fix-crc-worker-races.patch
drm-mcde-fix-uninitialized-variable.patch
drm-bridge-tc358767-increase-aux-transfer-length-lim.patch
drm-vkms-avoid-assigning-0-for-possible_crtc.patch
drm-panel-simple-fix-auo-g185han01-horizontal-blanki.patch
drm-amd-display-add-monitor-patch-to-add-t7-delay.patch
drm-amd-display-power-gate-all-dscs-at-driver-init-t.patch
drm-amd-display-fix-not-calling-ppsmu-to-trigger-pme.patch
drm-amd-display-clear-fec_ready-shadow-register-if-d.patch
drm-amd-display-copy-gsl-groups-when-committing-a-ne.patch
video-ssd1307fb-start-page-range-at-page_offset.patch
drm-tinydrm-kconfig-drivers-select-backlight_class_d.patch
drm-stm-attach-gem-fence-to-atomic-state.patch
drm-bridge-sii902x-fix-missing-reference-to-mclk-clo.patch
drm-panel-check-failure-cases-in-the-probe-func.patch
drm-rockchip-check-for-fast-link-training-before-ena.patch
drm-amdgpu-fix-hard-hang-for-s-g-display-bos.patch
drm-amd-display-use-proper-enum-conversion-functions.patch
drm-radeon-fix-eeh-during-kexec.patch
gpu-drm-radeon-fix-a-possible-null-pointer-dereferen.patch
clk-imx8mq-mark-ahb-clock-as-critical.patch
pci-rpaphp-avoid-a-sometimes-uninitialized-warning.patch
pinctrl-stmfx-update-pinconf-settings.patch
ipmi_si-only-schedule-continuously-in-the-thread-in-.patch
clk-qoriq-fix-wunused-const-variable.patch
clk-ingenic-jz4740-fix-pll-half-divider-not-read-wri.patch
clk-sunxi-ng-v3s-add-missing-clock-slices-for-mmc2-m.patch
drm-amd-display-fix-issue-where-252-255-values-are-c.patch
drm-amd-display-fix-frames_to_insert-math.patch
drm-amd-display-reprogram-vm-config-when-system-resu.patch
drm-amd-display-register-vupdate_no_lock-interrupts-.patch
powerpc-powernv-ioda2-allocate-tce-table-levels-on-d.patch
clk-actions-don-t-reference-clk_init_data-after-regi.patch
clk-sirf-don-t-reference-clk_init_data-after-registr.patch
clk-meson-axg-audio-don-t-reference-clk_init_data-af.patch
clk-sprd-don-t-reference-clk_init_data-after-registr.patch
clk-zx296718-don-t-reference-clk_init_data-after-reg.patch
clk-sunxi-don-t-call-clk_hw_get_name-on-a-hw-that-is.patch
powerpc-xmon-check-for-hv-mode-when-dumping-xive-inf.patch
powerpc-rtas-use-device-model-apis-and-serialization.patch
powerpc-ptdump-fix-walk_pagetables-address-mismatch.patch
powerpc-futex-fix-warning-oldval-may-be-used-uniniti.patch
powerpc-64s-radix-fix-memory-hotplug-section-page-ta.patch
powerpc-pseries-mobility-use-cond_resched-when-updat.patch
powerpc-perf-fix-imc-allocation-failure-handling.patch
pinctrl-tegra-fix-write-barrier-placement-in-pmx_wri.patch
powerpc-eeh-clear-stale-eeh_dev_no_handler-flag.patch
vfio_pci-restore-original-state-on-release.patch
drm-amdgpu-sdma5-fix-number-of-sdma5-trap-irq-types-.patch
drm-nouveau-kms-tu102-disable-input-lut-when-input-i.patch
drm-nouveau-volt-fix-for-some-cards-having-0-maximum.patch
pinctrl-amd-disable-spurious-firing-gpio-irqs.patch
clk-renesas-mstp-set-genpd_flag_always_on-for-clock-.patch
clk-renesas-cpg-mssr-set-genpd_flag_always_on-for-cl.patch
drm-amd-display-support-spdif.patch
drm-amd-powerpaly-fix-navi-series-custom-peak-level-.patch
drm-amd-display-fix-mpo-hubp-underflow-with-scatter-.patch
drm-amd-display-fix-trigger-not-generated-for-freesy.patch
selftests-powerpc-retry-on-host-facility-unavailable.patch
kbuild-do-not-enable-wimplicit-fallthrough-for-clang.patch
drm-amdgpu-si-fix-asic-tests.patch
powerpc-64s-exception-machine-check-use-correct-cfar.patch
pstore-fs-superblock-limits.patch
powerpc-eeh-clean-up-eeh-pes-after-recovery-finishes.patch
clk-qcom-gcc-sdm845-use-floor-ops-for-sdcc-clks.patch
powerpc-pseries-correctly-track-irq-state-in-default.patch
pinctrl-meson-gxbb-fix-wrong-pinning-definition-for-.patch
mailbox-mediatek-cmdq-clear-the-event-in-cmdq-initia.patch
arm-dts-dir685-drop-spi-cpol-from-the-display.patch
arm64-fix-unreachable-code-issue-with-cmpxchg.patch
clk-at91-select-parent-if-main-oscillator-or-bypass-.patch
clk-imx-pll14xx-avoid-glitch-when-set-rate.patch
clk-imx-clk-pll14xx-unbypass-pll-by-default.patch
clk-make-clk_bulk_get_all-return-a-valid-id.patch
powerpc-dump-kernel-log-before-carrying-out-fadump-o.patch
mbox-qcom-add-apcs-child-device-for-qcs404.patch
clk-sprd-add-missing-kfree.patch
scsi-core-reduce-memory-required-for-scsi-logging.patch
dma-buf-sw_sync-synchronize-signal-vs-syncpt-free.patch
f2fs-fix-to-drop-meta-node-pages-during-umount.patch
ext4-fix-potential-use-after-free-after-remounting-w.patch
mips-ingenic-disable-broken-btb-lookup-optimization.patch
mips-don-t-use-bc_false-uninitialized-in-__mm_isbran.patch
mips-tlbex-explicitly-cast-_page_no_exec-to-a-boolea.patch
i2c-cht-wc-fix-lockdep-warning.patch
mfd-intel-lpss-remove-d3cold-delay.patch
pci-tegra-fix-of-node-reference-leak.patch
hid-wacom-fix-several-minor-compiler-warnings.patch
rtc-bd70528-fix-driver-dependencies.patch
mips-atomic-fix-loongson_llsc_mb-wreckage.patch
pci-pci-hyperv-fix-build-errors-on-non-sysfs-config.patch
pci-layerscape-add-the-bar_fixed_64bit-property-to-t.patch
livepatch-nullify-obj-mod-in-klp_module_coming-s-err.patch
mips-atomic-fix-smp_mb__-before-after-_atomic.patch
arm-8898-1-mm-don-t-treat-faults-reported-from-cache.patch
soundwire-intel-fix-channel-number-reported-by-hardw.patch
pci-mobiveil-fix-the-cpu-base-address-setup-in-inbou.patch
arm-8875-1-kconfig-default-to-aeabi-w-clang.patch
rtc-snvs-fix-possible-race-condition.patch
rtc-pcf85363-pcf85263-fix-regmap-error-in-set_time.patch
power-supply-register-hwmon-devices-with-valid-names.patch
selinux-fix-residual-uses-of-current_security-for-th.patch
pci-add-pci_info_ratelimited-to-ratelimit-pci-separa.patch
hid-apple-fix-stuck-function-keys-when-using-fn.patch
pci-rockchip-propagate-errors-for-optional-regulator.patch
pci-histb-propagate-errors-for-optional-regulators.patch
pci-imx6-propagate-errors-for-optional-regulators.patch
pci-exynos-propagate-errors-for-optional-phys.patch
security-smack-fix-possible-null-pointer-dereference.patch
pci-use-static-const-struct-not-const-static-struct.patch
arm-8905-1-emit-__gnu_mcount_nc-when-using-clang-10..patch
arm-8903-1-ensure-that-usable-memory-in-bank-0-start.patch
i2c-tegra-move-suspend-handling-to-noirq-phase.patch
block-bfq-push-up-injection-only-after-setting-servi.patch
fat-work-around-race-with-userspace-s-read-via-block.patch
pktcdvd-remove-warning-on-attempting-to-register-non.patch
hypfs-fix-error-number-left-in-struct-pointer-member.patch
tools-power-x86-intel-speed-select-fix-high-priority.patch
crypto-hisilicon-fix-double-free-in-sec_free_hw_sgl.patch
mm-add-dummy-can_do_mlock-helper.patch
kbuild-clean-compressed-initramfs-image.patch
ocfs2-wait-for-recovering-done-after-direct-unlock-r.patch
kmemleak-increase-debug_kmemleak_early_log_size-defa.patch
arm64-consider-stack-randomization-for-mmap-base-onl.patch
mips-properly-account-for-stack-randomization-and-st.patch
arm-properly-account-for-stack-randomization-and-sta.patch
arm-use-stack_top-when-computing-mmap-base-address.patch
Compile testing
---------------
We compiled the kernel for 3 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
x86_64:
Host 1:
✅ Boot test
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
Host 2:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ pciutils: sanity smoke test
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
✅ stress: stress-ng
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 3:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
Test sources: https://github.com/CKI-project/tests-beaker
💚 Pull requests are welcome for new tests or improvements to existing tests!
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
The patch below does not apply to the 3.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5c2e9f346b815841f9bed6029ebcb06415caf640 Mon Sep 17 00:00:00 2001
From: Mark Salyzyn <salyzyn(a)android.com>
Date: Thu, 29 Aug 2019 11:30:14 -0700
Subject: [PATCH] ovl: filter of trusted xattr results in audit
When filtering the xattr list for reading, the presence of a trusted xattr
results in a security audit log entry. However, if there is other content
no errno will be set, and if there isn't, the errno will be -ENODATA
and not -EPERM, as is usually associated with a lack of capability.
The check does not block the request to list the xattrs present.
Switch to ns_capable_noaudit to reflect a more appropriate check.
Signed-off-by: Mark Salyzyn <salyzyn(a)android.com>
Cc: linux-security-module(a)vger.kernel.org
Cc: kernel-team(a)android.com
Cc: stable(a)vger.kernel.org # v3.18+
Fixes: a082c6f680da ("ovl: filter trusted xattr for non-admin")
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7663aeb85fa3..bc14781886bf 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -383,7 +383,8 @@ static bool ovl_can_list(const char *s)
return true;
/* Never list trusted.overlay, list other trusted for superuser only */
- return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
+ return !ovl_is_private_xattr(s) &&
+ ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
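The distinction the patch relies on is that both helpers perform the same CAP_SYS_ADMIN test, but only capable() emits an audit record when the check fails. A minimal sketch of the silent variant (can_list_trusted() is an illustrative wrapper, not overlayfs code):

static bool can_list_trusted(void)
{
    /* capable(CAP_SYS_ADMIN) would log a denial on every listxattr by an
     * unprivileged task; ns_capable_noaudit() does the same check silently. */
    return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}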
Hi Sasha,
I think this patch shouldn't be added to the stable trees; it will
cause the TCU clock to be managed by the clock framework and
automatically gated after bootup since it has no client, causing a
global system lockup. It's only really applicable to 5.3.
Thanks,
-Paul
On Sat, Oct 5, 2019 at 19:56, Sasha Levin <sashal(a)kernel.org>
wrote:
> This is a note to let you know that I've just added the patch titled
>
> clk: jz4740: Add TCU clock
>
> to the 4.4-stable tree which can be found at:
>
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> clk-jz4740-add-tcu-clock.patch
> and it can be found in the queue-4.4 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable
> tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit c72b7e327a8b4d3b870f76011f674134f9ac38f6
> Author: Paul Cercueil <paul(a)crapouillou.net>
> Date: Wed Jul 24 13:16:10 2019 -0400
>
> clk: jz4740: Add TCU clock
>
> [ Upstream commit 73dd11dc1a883d4c994d729dc9984f4890001157 ]
>
> Add the missing TCU clock to the list of clocks supplied by the
> CGU for
> the JZ4740 SoC.
>
> Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
> Tested-by: Mathieu Malaterre <malat(a)debian.org>
> Tested-by: Artur Rojek <contact(a)artur-rojek.eu>
> Acked-by: Stephen Boyd <sboyd(a)kernel.org>
> Acked-by: Rob Herring <robh(a)kernel.org>
> Signed-off-by: Paul Burton <paul.burton(a)mips.com>
> Cc: Ralf Baechle <ralf(a)linux-mips.org>
> Cc: James Hogan <jhogan(a)kernel.org>
> Cc: Jonathan Corbet <corbet(a)lwn.net>
> Cc: Lee Jones <lee.jones(a)linaro.org>
> Cc: Arnd Bergmann <arnd(a)arndb.de>
> Cc: Daniel Lezcano <daniel.lezcano(a)linaro.org>
> Cc: Thomas Gleixner <tglx(a)linutronix.de>
> Cc: Michael Turquette <mturquette(a)baylibre.com>
> Cc: Jason Cooper <jason(a)lakedaemon.net>
> Cc: Marc Zyngier <marc.zyngier(a)arm.com>
> Cc: Rob Herring <robh+dt(a)kernel.org>
> Cc: Mark Rutland <mark.rutland(a)arm.com>
> Cc: devicetree(a)vger.kernel.org
> Cc: linux-kernel(a)vger.kernel.org
> Cc: linux-doc(a)vger.kernel.org
> Cc: linux-mips(a)vger.kernel.org
> Cc: linux-clk(a)vger.kernel.org
> Cc: od(a)zcrc.me
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/clk/ingenic/jz4740-cgu.c
> b/drivers/clk/ingenic/jz4740-cgu.c
> index 305a26c2a800e..01b5b8b103888 100644
> --- a/drivers/clk/ingenic/jz4740-cgu.c
> +++ b/drivers/clk/ingenic/jz4740-cgu.c
> @@ -211,6 +211,12 @@ static const struct ingenic_cgu_clk_info
> jz4740_cgu_clocks[] = {
> .parents = { JZ4740_CLK_EXT, -1, -1, -1 },
> .gate = { CGU_REG_CLKGR, 5 },
> },
> +
> + [JZ4740_CLK_TCU] = {
> + "tcu", CGU_CLK_GATE,
> + .parents = { JZ4740_CLK_EXT, -1, -1, -1 },
> + .gate = { CGU_REG_CLKGR, 1 },
> + },
> };
>
> static void __init jz4740_cgu_init(struct device_node *np)
> diff --git a/include/dt-bindings/clock/jz4740-cgu.h
> b/include/dt-bindings/clock/jz4740-cgu.h
> index 43153d3e9bd26..ff7c27bc98e37 100644
> --- a/include/dt-bindings/clock/jz4740-cgu.h
> +++ b/include/dt-bindings/clock/jz4740-cgu.h
> @@ -33,5 +33,6 @@
> #define JZ4740_CLK_ADC 19
> #define JZ4740_CLK_I2C 20
> #define JZ4740_CLK_AIC 21
> +#define JZ4740_CLK_TCU 22
>
> #endif /* __DT_BINDINGS_CLOCK_JZ4740_CGU_H__ */
Hello,
We ran automated tests on a patchset that was proposed for merging into this
kernel tree. The patches were applied to:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: OK
Tests: FAILED
All kernel binaries, config files, and logs are available for download here:
https://artifacts.cki-project.org/pipelines/208531
One or more kernel tests failed:
aarch64:
❌ Boot test
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Merge testing
-------------
We cloned this repository and checked out the following commit:
Repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Commit: 40ce0335271a - Linux 5.3.4
We grabbed the 6a00b417bec3 commit of the stable queue repository.
We then merged the patchset with `git am`:
drm-vkms-fix-crc-worker-races.patch
drm-mcde-fix-uninitialized-variable.patch
drm-bridge-tc358767-increase-aux-transfer-length-lim.patch
drm-vkms-avoid-assigning-0-for-possible_crtc.patch
drm-panel-simple-fix-auo-g185han01-horizontal-blanki.patch
drm-amd-display-add-monitor-patch-to-add-t7-delay.patch
drm-amd-display-power-gate-all-dscs-at-driver-init-t.patch
drm-amd-display-fix-not-calling-ppsmu-to-trigger-pme.patch
drm-amd-display-clear-fec_ready-shadow-register-if-d.patch
drm-amd-display-copy-gsl-groups-when-committing-a-ne.patch
video-ssd1307fb-start-page-range-at-page_offset.patch
drm-tinydrm-kconfig-drivers-select-backlight_class_d.patch
drm-stm-attach-gem-fence-to-atomic-state.patch
drm-bridge-sii902x-fix-missing-reference-to-mclk-clo.patch
drm-panel-check-failure-cases-in-the-probe-func.patch
drm-rockchip-check-for-fast-link-training-before-ena.patch
drm-amdgpu-fix-hard-hang-for-s-g-display-bos.patch
drm-amd-display-use-proper-enum-conversion-functions.patch
drm-radeon-fix-eeh-during-kexec.patch
gpu-drm-radeon-fix-a-possible-null-pointer-dereferen.patch
clk-imx8mq-mark-ahb-clock-as-critical.patch
pci-rpaphp-avoid-a-sometimes-uninitialized-warning.patch
pinctrl-stmfx-update-pinconf-settings.patch
ipmi_si-only-schedule-continuously-in-the-thread-in-.patch
clk-qoriq-fix-wunused-const-variable.patch
clk-ingenic-jz4740-fix-pll-half-divider-not-read-wri.patch
clk-sunxi-ng-v3s-add-missing-clock-slices-for-mmc2-m.patch
drm-amd-display-fix-issue-where-252-255-values-are-c.patch
drm-amd-display-fix-frames_to_insert-math.patch
drm-amd-display-reprogram-vm-config-when-system-resu.patch
drm-amd-display-register-vupdate_no_lock-interrupts-.patch
powerpc-powernv-ioda2-allocate-tce-table-levels-on-d.patch
clk-actions-don-t-reference-clk_init_data-after-regi.patch
clk-sirf-don-t-reference-clk_init_data-after-registr.patch
clk-meson-axg-audio-don-t-reference-clk_init_data-af.patch
clk-sprd-don-t-reference-clk_init_data-after-registr.patch
clk-zx296718-don-t-reference-clk_init_data-after-reg.patch
clk-sunxi-don-t-call-clk_hw_get_name-on-a-hw-that-is.patch
powerpc-xmon-check-for-hv-mode-when-dumping-xive-inf.patch
powerpc-rtas-use-device-model-apis-and-serialization.patch
powerpc-ptdump-fix-walk_pagetables-address-mismatch.patch
powerpc-futex-fix-warning-oldval-may-be-used-uniniti.patch
powerpc-64s-radix-fix-memory-hotplug-section-page-ta.patch
powerpc-pseries-mobility-use-cond_resched-when-updat.patch
powerpc-perf-fix-imc-allocation-failure-handling.patch
pinctrl-tegra-fix-write-barrier-placement-in-pmx_wri.patch
powerpc-eeh-clear-stale-eeh_dev_no_handler-flag.patch
vfio_pci-restore-original-state-on-release.patch
drm-amdgpu-sdma5-fix-number-of-sdma5-trap-irq-types-.patch
drm-nouveau-kms-tu102-disable-input-lut-when-input-i.patch
drm-nouveau-volt-fix-for-some-cards-having-0-maximum.patch
pinctrl-amd-disable-spurious-firing-gpio-irqs.patch
clk-renesas-mstp-set-genpd_flag_always_on-for-clock-.patch
clk-renesas-cpg-mssr-set-genpd_flag_always_on-for-cl.patch
drm-amd-display-support-spdif.patch
drm-amd-powerpaly-fix-navi-series-custom-peak-level-.patch
drm-amd-display-fix-mpo-hubp-underflow-with-scatter-.patch
drm-amd-display-fix-trigger-not-generated-for-freesy.patch
selftests-powerpc-retry-on-host-facility-unavailable.patch
kbuild-do-not-enable-wimplicit-fallthrough-for-clang.patch
drm-amdgpu-si-fix-asic-tests.patch
powerpc-64s-exception-machine-check-use-correct-cfar.patch
pstore-fs-superblock-limits.patch
powerpc-eeh-clean-up-eeh-pes-after-recovery-finishes.patch
clk-qcom-gcc-sdm845-use-floor-ops-for-sdcc-clks.patch
powerpc-pseries-correctly-track-irq-state-in-default.patch
pinctrl-meson-gxbb-fix-wrong-pinning-definition-for-.patch
mailbox-mediatek-cmdq-clear-the-event-in-cmdq-initia.patch
arm-dts-dir685-drop-spi-cpol-from-the-display.patch
arm64-fix-unreachable-code-issue-with-cmpxchg.patch
clk-at91-select-parent-if-main-oscillator-or-bypass-.patch
clk-imx-pll14xx-avoid-glitch-when-set-rate.patch
clk-imx-clk-pll14xx-unbypass-pll-by-default.patch
clk-make-clk_bulk_get_all-return-a-valid-id.patch
powerpc-dump-kernel-log-before-carrying-out-fadump-o.patch
mbox-qcom-add-apcs-child-device-for-qcs404.patch
clk-sprd-add-missing-kfree.patch
scsi-core-reduce-memory-required-for-scsi-logging.patch
dma-buf-sw_sync-synchronize-signal-vs-syncpt-free.patch
f2fs-fix-to-drop-meta-node-pages-during-umount.patch
ext4-fix-potential-use-after-free-after-remounting-w.patch
mips-ingenic-disable-broken-btb-lookup-optimization.patch
clk-jz4740-add-tcu-clock.patch
mips-don-t-use-bc_false-uninitialized-in-__mm_isbran.patch
mips-tlbex-explicitly-cast-_page_no_exec-to-a-boolea.patch
i2c-cht-wc-fix-lockdep-warning.patch
mfd-intel-lpss-remove-d3cold-delay.patch
pci-tegra-fix-of-node-reference-leak.patch
hid-wacom-fix-several-minor-compiler-warnings.patch
rtc-bd70528-fix-driver-dependencies.patch
mips-atomic-fix-loongson_llsc_mb-wreckage.patch
pci-pci-hyperv-fix-build-errors-on-non-sysfs-config.patch
pci-layerscape-add-the-bar_fixed_64bit-property-to-t.patch
livepatch-nullify-obj-mod-in-klp_module_coming-s-err.patch
mips-atomic-fix-smp_mb__-before-after-_atomic.patch
arm-8898-1-mm-don-t-treat-faults-reported-from-cache.patch
soundwire-intel-fix-channel-number-reported-by-hardw.patch
pci-mobiveil-fix-the-cpu-base-address-setup-in-inbou.patch
arm-8875-1-kconfig-default-to-aeabi-w-clang.patch
rtc-snvs-fix-possible-race-condition.patch
rtc-pcf85363-pcf85263-fix-regmap-error-in-set_time.patch
power-supply-register-hwmon-devices-with-valid-names.patch
selinux-fix-residual-uses-of-current_security-for-th.patch
pci-add-pci_info_ratelimited-to-ratelimit-pci-separa.patch
hid-apple-fix-stuck-function-keys-when-using-fn.patch
pci-rockchip-propagate-errors-for-optional-regulator.patch
pci-histb-propagate-errors-for-optional-regulators.patch
pci-imx6-propagate-errors-for-optional-regulators.patch
pci-exynos-propagate-errors-for-optional-phys.patch
security-smack-fix-possible-null-pointer-dereference.patch
pci-use-static-const-struct-not-const-static-struct.patch
arm-8905-1-emit-__gnu_mcount_nc-when-using-clang-10..patch
arm-8903-1-ensure-that-usable-memory-in-bank-0-start.patch
i2c-tegra-move-suspend-handling-to-noirq-phase.patch
block-bfq-push-up-injection-only-after-setting-servi.patch
fat-work-around-race-with-userspace-s-read-via-block.patch
pktcdvd-remove-warning-on-attempting-to-register-non.patch
hypfs-fix-error-number-left-in-struct-pointer-member.patch
tools-power-x86-intel-speed-select-fix-high-priority.patch
crypto-hisilicon-fix-double-free-in-sec_free_hw_sgl.patch
mm-add-dummy-can_do_mlock-helper.patch
kbuild-clean-compressed-initramfs-image.patch
ocfs2-wait-for-recovering-done-after-direct-unlock-r.patch
kmemleak-increase-debug_kmemleak_early_log_size-defa.patch
arm64-consider-stack-randomization-for-mmap-base-onl.patch
mips-properly-account-for-stack-randomization-and-st.patch
arm-properly-account-for-stack-randomization-and-sta.patch
arm-use-stack_top-when-computing-mmap-base-address.patch
Compile testing
---------------
We compiled the kernel for 3 architectures:
aarch64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
✅ stress: stress-ng
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 2:
❌ Boot test
⚡⚡⚡ xfstests: ext4
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ lvm thinp sanity
⚡⚡⚡ storage: software RAID testing
🚧 ⚡⚡⚡ Storage blktests
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ Usex - version 1.9-29
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
x86_64:
Host 1:
✅ Boot test
✅ Podman system integration test (as root)
✅ Podman system integration test (as user)
✅ Loopdev Sanity
✅ jvm test suite
✅ AMTU (Abstract Machine Test Utility)
✅ audit: audit testsuite test
✅ httpd: mod_ssl smoke sanity
✅ iotop: sanity
✅ tuned: tune-processes-through-perf
✅ pciutils: sanity smoke test
✅ Usex - version 1.9-29
✅ storage: SCSI VPD
✅ stress: stress-ng
🚧 ✅ LTP lite
🚧 ✅ POSIX pjd-fstest suites
Host 2:
✅ Boot test
✅ xfstests: ext4
✅ selinux-policy: serge-testsuite
✅ lvm thinp sanity
✅ storage: software RAID testing
🚧 ✅ Storage blktests
Host 3:
✅ Boot test
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
Test sources: https://github.com/CKI-project/tests-beaker
💚 Pull requests are welcome for new tests or improvements to existing tests!
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
From: Daniel Borkmann <daniel(a)iogearbox.net>
commit c751798aa224fadc5124b49eeb38fb468c0fa039 upstream.
syzkaller managed to trigger the warning in bpf_jit_free() which checks via
bpf_prog_kallsyms_verify_off() for potentially unlinked JITed BPF progs
in kallsyms, and subsequently trips over a GPF when walking kallsyms entries:
[...]
8021q: adding VLAN 0 to HW filter on device batadv0
8021q: adding VLAN 0 to HW filter on device batadv0
WARNING: CPU: 0 PID: 9869 at kernel/bpf/core.c:810 bpf_jit_free+0x1e8/0x2a0
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events bpf_prog_free_deferred
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x113/0x167 lib/dump_stack.c:113
panic+0x212/0x40b kernel/panic.c:214
__warn.cold.8+0x1b/0x38 kernel/panic.c:571
report_bug+0x1a4/0x200 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271
do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973
RIP: 0010:bpf_jit_free+0x1e8/0x2a0
Code: 02 4c 89 e2 83 e2 07 38 d0 7f 08 84 c0 0f 85 86 00 00 00 48 ba 00 02 00 00 00 00 ad de 0f b6 43 02 49 39 d6 0f 84 5f fe ff ff <0f> 0b e9 58 fe ff ff 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1
RSP: 0018:ffff888092f67cd8 EFLAGS: 00010202
RAX: 0000000000000007 RBX: ffffc90001947000 RCX: ffffffff816e9d88
RDX: dead000000000200 RSI: 0000000000000008 RDI: ffff88808769f7f0
RBP: ffff888092f67d00 R08: fffffbfff1394059 R09: fffffbfff1394058
R10: fffffbfff1394058 R11: ffffffff89ca02c7 R12: ffffc90001947002
R13: ffffc90001947020 R14: ffffffff881eca80 R15: ffff88808769f7e8
BUG: unable to handle kernel paging request at fffffbfff400d000
#PF error: [normal kernel read fault]
PGD 21ffee067 P4D 21ffee067 PUD 21ffed067 PMD 9f942067 PTE 0
Oops: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events bpf_prog_free_deferred
RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:495 [inline]
RIP: 0010:bpf_tree_comp kernel/bpf/core.c:558 [inline]
RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline]
RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline]
RIP: 0010:bpf_prog_kallsyms_find+0x107/0x2e0 kernel/bpf/core.c:632
Code: 00 f0 ff ff 44 38 c8 7f 08 84 c0 0f 85 fa 00 00 00 41 f6 45 02 01 75 02 0f 0b 48 39 da 0f 82 92 00 00 00 48 89 d8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 03 0f 8e 45 01 00 00 8b 03 48 c1 e0
[...]
Upon further debugging, it turns out that whenever we trigger this
issue, the kallsyms removal in bpf_prog_ksym_node_del() was /skipped/,
yet bpf_jit_free() reported that the entry was still /in use/.
The problem is that both the symbol exposure via bpf_prog_kallsyms_add()
and perf_event_bpf_event() were done /after/ bpf_prog_new_fd(). Once the
fd is exposed to userspace, a parallel close request can come in right
before we attempt to do the bpf_prog_kallsyms_add(). Given that at this
time the prog reference count is one, we start to rip
everything underneath us via bpf_prog_release() -> bpf_prog_put().
The memory is eventually released via deferred free, so we're seeing
that bpf_jit_free() has a kallsym entry because we added it from
bpf_prog_load() but /after/ bpf_prog_put() from the remote CPU.
Therefore, move both notifications /before/ we install the fd. The
issue was never seen between bpf_prog_alloc_id() and bpf_prog_new_fd()
because upon bpf_prog_get_fd_by_id() we'll take another reference to
the BPF prog, so we're still holding the original reference from the
bpf_prog_load().
Fixes: 6ee52e2a3fe4 ("perf, bpf: Introduce PERF_RECORD_BPF_EVENT")
Fixes: 74451e66d516 ("bpf: make jited programs visible in traces")
Reported-by: syzbot+bd3bba6ff3fcea7a6ec6(a)syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Cc: Song Liu <songliubraving(a)fb.com>
Signed-off-by: Zubin Mithra <zsm(a)chromium.org>
---
Notes:
* Syzkaller triggered a WARNING on 4.14 kernels with the following
stacktrace:
Call Trace:
dump_stack+0x81/0xb3
panic+0x14a/0x2ad
? refcount_error_report+0xf6/0xf6
? set_fs+0x1a/0x29
? bpf_jit_free+0x8b/0xce
__warn+0xde/0x112
? bpf_jit_free+0x8b/0xce
report_bug+0x91/0xda
fixup_bug+0x2c/0x4c
do_error_trap+0xda/0x192
? fixup_bug+0x4c/0x4c
? hlock_class+0x6d/0x8b
? mark_lock+0x3a/0x26d
? trace_hardirqs_off_caller+0xf2/0xfb
? trace_hardirqs_off_thunk+0x1a/0x1c
invalid_op+0x1b/0x40
? bpf_jit_binary_free+0x15/0x20
? bpf_jit_free+0x7b/0xce
process_one_work+0x484/0x793
? wq_calc_node_cpumask.constprop.37+0x25/0x25
? worker_clr_flags+0x52/0x88
worker_thread+0x2b8/0x3d1
? rescuer_thread+0x425/0x425
kthread+0x192/0x1a2
? __list_del_entry+0x41/0x41
ret_from_fork+0x3a/0x50
* The commit is not present in linux-4.19.y. A backport has been sent
separately.
* The patch resolves conflicts inside bpf_prog_load that arise due to
trace_bpf_prog_load() not being present upstream when c751798aa224 was
applied and perf_event_bpf_event() not being present in linux-4.14.y.
* Tests run: Chrome OS tryjobs, Syzkaller reproducer
kernel/bpf/syscall.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2d828d3469822..59d2e94ecb798 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1067,20 +1067,26 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0) {
- /* failed to allocate fd.
- * bpf_prog_put() is needed because the above
- * bpf_prog_alloc_id() has published the prog
- * to the userspace and the userspace may
- * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
- */
- bpf_prog_put(prog);
- return err;
- }
-
+ /* Upon success of bpf_prog_alloc_id(), the BPF prog is
+ * effectively publicly exposed. However, retrieving via
+ * bpf_prog_get_fd_by_id() will take another reference,
+ * therefore it cannot be gone underneath us.
+ *
+ * Only for the time /after/ successful bpf_prog_new_fd()
+ * and before returning to userspace, we might just hold
+ * one reference and any parallel close on that fd could
+ * rip everything out. Hence, below notifications must
+ * happen before bpf_prog_new_fd().
+ *
+ * Also, any failure handling from this point onwards must
+ * be using bpf_prog_put() given the program is exposed.
+ */
bpf_prog_kallsyms_add(prog);
trace_bpf_prog_load(prog, err);
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ bpf_prog_put(prog);
return err;
free_used_maps:
--
2.23.0.444.g18eeb5a265-goog
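For context on the fd lifecycle the reordering above protects, here is a
minimal userspace sketch (an illustration only, assuming a 64-bit Linux
host with the uapi BPF headers; the "return 0" socket filter is simply the
smallest program the verifier accepts). The fd returned by BPF_PROG_LOAD
is the one installed by bpf_prog_new_fd(), and close() on it drops the
reference via bpf_prog_put(); a close() racing in from another thread
right after fd installation is exactly the window the patch narrows by
moving the kallsyms/perf notifications earlier.
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Two-instruction program: r0 = 0; exit; i.e. "return 0". */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
		  .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (unsigned long)"GPL";

	/* bpf_prog_load() runs in the kernel here; on success the returned
	 * fd is the one installed by bpf_prog_new_fd(). */
	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0) {
		perror("BPF_PROG_LOAD");
		return 1;
	}

	/* Dropping the last reference goes through bpf_prog_put() and the
	 * deferred free path (bpf_prog_free_deferred() -> bpf_jit_free()). */
	close(fd);
	return 0;
}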
[ Upstream commit cb8acabbe33b110157955a7425ee876fb81e6bbc ]
Commit 7211aef86f79 ("block: mq-deadline: Fix write completion
handling") added a call to blk_mq_sched_mark_restart_hctx() in
dd_dispatch_request() to make sure that write request dispatching does
not stall when all target zones are locked. This fix left a subtle race
when a write completion happens during a dispatch execution on another
CPU:
CPU 0: Dispatch CPU1: write completion
dd_dispatch_request()
lock(&dd->lock);
...
lock(&dd->zone_lock); dd_finish_request()
rq = find request lock(&dd->zone_lock);
unlock(&dd->zone_lock);
zone write unlock
unlock(&dd->zone_lock);
...
__blk_mq_free_request
check restart flag (not set)
-> queue not run
...
if (!rq && have writes)
blk_mq_sched_mark_restart_hctx()
unlock(&dd->lock)
Since the dispatch context finishes after the write request completion
handling, marking the queue as needing a restart is not seen from
__blk_mq_free_request(), and blk_mq_sched_restart() is not executed,
leading to a dispatch stall under 100% write workloads.
Fix this by moving the call to blk_mq_sched_mark_restart_hctx() from
dd_dispatch_request() into dd_finish_request() under the zone lock to
ensure full mutual exclusion between write request dispatch selection
and zone unlock on write request completion.
Fixes: 7211aef86f79 ("block: mq-deadline: Fix write completion handling")
Cc: stable(a)vger.kernel.org
Reported-by: Hans Holmberg <Hans.Holmberg(a)wdc.com>
Reviewed-by: Hans Holmberg <hans.holmberg(a)wdc.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
block/mq-deadline.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index d5e21ce44d2c..69094d641062 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -376,13 +376,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
- *
- * For a zoned block device, __dd_dispatch_request() may return NULL
- * if all the queued write requests are directed at zones that are already
- * locked due to on-going write requests. In this case, make sure to mark
- * the queue as needing a restart to ensure that the queue is run again
- * and the pending writes dispatched once the target zones for the ongoing
- * write requests are unlocked in dd_finish_request().
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -391,9 +384,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
- if (!rq && blk_queue_is_zoned(hctx->queue) &&
- !list_empty(&dd->fifo_list[WRITE]))
- blk_mq_sched_mark_restart_hctx(hctx);
spin_unlock(&dd->lock);
return rq;
@@ -559,6 +549,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio)
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing. This function is called for
* all requests, whether or not these requests complete successfully.
+ *
+ * For a zoned block device, __dd_dispatch_request() may have stopped
+ * dispatching requests if all the queued requests are write requests directed
+ * at zones that are already locked due to on-going write requests. To ensure
+ * write request dispatch progress in this case, mark the queue as needing a
+ * restart to ensure that the queue is run again after completion of the
+ * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -570,6 +567,12 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
+ if (!list_empty(&dd->fifo_list[WRITE])) {
+ struct blk_mq_hw_ctx *hctx;
+
+ hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+ blk_mq_sched_mark_restart_hctx(hctx);
+ }
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
--
2.21.0
The patch titled
Subject: mm/slub: fix a deadlock in show_slab_objects()
has been added to the -mm tree. Its filename is
mm-slub-fix-a-deadlock-in-show_slab_objects.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-slub-fix-a-deadlock-in-show_sla…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-slub-fix-a-deadlock-in-show_sla…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Qian Cai <cai(a)lca.pw>
Subject: mm/slub: fix a deadlock in show_slab_objects()
A long time ago we fixed a similar deadlock in show_slab_objects() [1].
However, apparently due to commits like 01fb58bcba63 ("slab:
remove synchronous synchronize_sched() from memcg cache deactivation
path") and 03afc0e25f7f ("slab: get_online_mems for
kmem_cache_{create,destroy,shrink}"), this kind of deadlock is back: just
reading files in /sys/kernel/slab will generate the lockdep splat below.
Since the "mem_hotplug_lock" here is only taken to obtain a stable online
node mask while racing with NUMA node hotplug, in the worst case the
results may be miscalculated while a NUMA node hotplug is in progress, but
they shall be corrected by later reads of the same files.
WARNING: possible circular locking dependency detected
------------------------------------------------------
cat/5224 is trying to acquire lock:
ffff900012ac3120 (mem_hotplug_lock.rw_sem){++++}, at:
show_slab_objects+0x94/0x3a8
but task is already holding lock:
b8ff009693eee398 (kn->count#45){++++}, at: kernfs_seq_start+0x44/0xf0
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (kn->count#45){++++}:
lock_acquire+0x31c/0x360
__kernfs_remove+0x290/0x490
kernfs_remove+0x30/0x44
sysfs_remove_dir+0x70/0x88
kobject_del+0x50/0xb0
sysfs_slab_unlink+0x2c/0x38
shutdown_cache+0xa0/0xf0
kmemcg_cache_shutdown_fn+0x1c/0x34
kmemcg_workfn+0x44/0x64
process_one_work+0x4f4/0x950
worker_thread+0x390/0x4bc
kthread+0x1cc/0x1e8
ret_from_fork+0x10/0x18
-> #1 (slab_mutex){+.+.}:
lock_acquire+0x31c/0x360
__mutex_lock_common+0x16c/0xf78
mutex_lock_nested+0x40/0x50
memcg_create_kmem_cache+0x38/0x16c
memcg_kmem_cache_create_func+0x3c/0x70
process_one_work+0x4f4/0x950
worker_thread+0x390/0x4bc
kthread+0x1cc/0x1e8
ret_from_fork+0x10/0x18
-> #0 (mem_hotplug_lock.rw_sem){++++}:
validate_chain+0xd10/0x2bcc
__lock_acquire+0x7f4/0xb8c
lock_acquire+0x31c/0x360
get_online_mems+0x54/0x150
show_slab_objects+0x94/0x3a8
total_objects_show+0x28/0x34
slab_attr_show+0x38/0x54
sysfs_kf_seq_show+0x198/0x2d4
kernfs_seq_show+0xa4/0xcc
seq_read+0x30c/0x8a8
kernfs_fop_read+0xa8/0x314
__vfs_read+0x88/0x20c
vfs_read+0xd8/0x10c
ksys_read+0xb0/0x120
__arm64_sys_read+0x54/0x88
el0_svc_handler+0x170/0x240
el0_svc+0x8/0xc
other info that might help us debug this:
Chain exists of:
mem_hotplug_lock.rw_sem --> slab_mutex --> kn->count#45
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(kn->count#45);
lock(slab_mutex);
lock(kn->count#45);
lock(mem_hotplug_lock.rw_sem);
*** DEADLOCK ***
3 locks held by cat/5224:
#0: 9eff00095b14b2a0 (&p->lock){+.+.}, at: seq_read+0x4c/0x8a8
#1: 0eff008997041480 (&of->mutex){+.+.}, at: kernfs_seq_start+0x34/0xf0
#2: b8ff009693eee398 (kn->count#45){++++}, at:
kernfs_seq_start+0x44/0xf0
stack backtrace:
Call trace:
dump_backtrace+0x0/0x248
show_stack+0x20/0x2c
dump_stack+0xd0/0x140
print_circular_bug+0x368/0x380
check_noncircular+0x248/0x250
validate_chain+0xd10/0x2bcc
__lock_acquire+0x7f4/0xb8c
lock_acquire+0x31c/0x360
get_online_mems+0x54/0x150
show_slab_objects+0x94/0x3a8
total_objects_show+0x28/0x34
slab_attr_show+0x38/0x54
sysfs_kf_seq_show+0x198/0x2d4
kernfs_seq_show+0xa4/0xcc
seq_read+0x30c/0x8a8
kernfs_fop_read+0xa8/0x314
__vfs_read+0x88/0x20c
vfs_read+0xd8/0x10c
ksys_read+0xb0/0x120
__arm64_sys_read+0x54/0x88
el0_svc_handler+0x170/0x240
el0_svc+0x8/0xc
[1] http://lkml.iu.edu/hypermail/linux/kernel/1101.0/02850.html
Link: http://lkml.kernel.org/r/1570192309-10132-1-git-send-email-cai@lca.pw
Fixes: 01fb58bcba63 ("slab: remove synchronous synchronize_sched() from memcg cache deactivation path")
Fixes: 03afc0e25f7f ("slab: get_online_mems for kmem_cache_{create,destroy,shrink}")
Signed-off-by: Qian Cai <cai(a)lca.pw>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/slub.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/mm/slub.c~mm-slub-fix-a-deadlock-in-show_slab_objects
+++ a/mm/slub.c
@@ -4846,7 +4846,13 @@ static ssize_t show_slab_objects(struct
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4887,7 +4893,6 @@ static ssize_t show_slab_objects(struct
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
_
Patches currently in -mm which might be from cai(a)lca.pw are
mm-page_alloc-fix-a-crash-in-free_pages_prepare.patch
mm-slub-fix-a-deadlock-in-show_slab_objects.patch
Since commit 9bcf15f75cac ("iio: adc: axp288: Fix TS-pin handling") we
preserve the bias current set by the firmware at boot. This fixes issues
we were seeing on various models, but it seems our old hardcoded 80µA bias
current was working around a firmware bug on at least one model laptop.
In order to both have our cake and eat it, this commit adds a DMI-based
list of models where we need to override the firmware-set bias current,
and adds to it the one model we now know needs this: the Lenovo Ideapad
100S (11 inch version).
Cc: stable(a)vger.kernel.org
Fixes: 9bcf15f75cac ("iio: adc: axp288: Fix TS-pin handling")
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=203829
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
---
drivers/iio/adc/axp288_adc.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c
index 31d51bcc5f2c..85d08e68b34f 100644
--- a/drivers/iio/adc/axp288_adc.c
+++ b/drivers/iio/adc/axp288_adc.c
@@ -7,6 +7,7 @@
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
+#include <linux/dmi.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/device.h>
@@ -25,6 +26,11 @@
#define AXP288_ADC_EN_MASK 0xF0
#define AXP288_ADC_TS_ENABLE 0x01
+#define AXP288_ADC_TS_BIAS_MASK GENMASK(5, 4)
+#define AXP288_ADC_TS_BIAS_20UA (0 << 4)
+#define AXP288_ADC_TS_BIAS_40UA (1 << 4)
+#define AXP288_ADC_TS_BIAS_60UA (2 << 4)
+#define AXP288_ADC_TS_BIAS_80UA (3 << 4)
#define AXP288_ADC_TS_CURRENT_ON_OFF_MASK GENMASK(1, 0)
#define AXP288_ADC_TS_CURRENT_OFF (0 << 0)
#define AXP288_ADC_TS_CURRENT_ON_WHEN_CHARGING (1 << 0)
@@ -177,10 +183,36 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev,
return ret;
}
+/*
+ * We rely on the machine's firmware to correctly setup the TS pin bias current
+ * at boot. This lists systems with broken fw where we need to set it ourselves.
+ */
+static const struct dmi_system_id axp288_adc_ts_bias_override[] = {
+ {
+ /* Lenovo Ideapad 100S (11 inch) */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo ideapad 100S-11IBY"),
+ },
+ .driver_data = (void *)(uintptr_t)AXP288_ADC_TS_BIAS_80UA,
+ },
+ {}
+};
+
static int axp288_adc_initialize(struct axp288_adc_info *info)
{
+ const struct dmi_system_id *bias_override;
int ret, adc_enable_val;
+ bias_override = dmi_first_match(axp288_adc_ts_bias_override);
+ if (bias_override) {
+ ret = regmap_update_bits(info->regmap, AXP288_ADC_TS_PIN_CTRL,
+ AXP288_ADC_TS_BIAS_MASK,
+ (uintptr_t)bias_override->driver_data);
+ if (ret)
+ return ret;
+ }
+
/*
* Determine if the TS pin is enabled and set the TS current-source
* accordingly.
--
2.23.0
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cb8acabbe33b110157955a7425ee876fb81e6bbc Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Wed, 28 Aug 2019 13:40:20 +0900
Subject: [PATCH] block: mq-deadline: Fix queue restart handling
Commit 7211aef86f79 ("block: mq-deadline: Fix write completion
handling") added a call to blk_mq_sched_mark_restart_hctx() in
dd_dispatch_request() to make sure that write request dispatching does
not stall when all target zones are locked. This fix left a subtle race
when a write completion happens during a dispatch execution on another
CPU:
CPU 0: Dispatch CPU1: write completion
dd_dispatch_request()
lock(&dd->lock);
...
lock(&dd->zone_lock); dd_finish_request()
rq = find request lock(&dd->zone_lock);
unlock(&dd->zone_lock);
zone write unlock
unlock(&dd->zone_lock);
...
__blk_mq_free_request
check restart flag (not set)
-> queue not run
...
if (!rq && have writes)
blk_mq_sched_mark_restart_hctx()
unlock(&dd->lock)
Since the dispatch context finishes after the write request completion
handling, marking the queue as needing a restart is not seen from
__blk_mq_free_request(), and blk_mq_sched_restart() is not executed,
leading to a dispatch stall under 100% write workloads.
Fix this by moving the call to blk_mq_sched_mark_restart_hctx() from
dd_dispatch_request() into dd_finish_request() under the zone lock to
ensure full mutual exclusion between write request dispatch selection
and zone unlock on write request completion.
Fixes: 7211aef86f79 ("block: mq-deadline: Fix write completion handling")
Cc: stable(a)vger.kernel.org
Reported-by: Hans Holmberg <Hans.Holmberg(a)wdc.com>
Reviewed-by: Hans Holmberg <hans.holmberg(a)wdc.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 2a2a2e82832e..35e84bc0ec8c 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -377,13 +377,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
- *
- * For a zoned block device, __dd_dispatch_request() may return NULL
- * if all the queued write requests are directed at zones that are already
- * locked due to on-going write requests. In this case, make sure to mark
- * the queue as needing a restart to ensure that the queue is run again
- * and the pending writes dispatched once the target zones for the ongoing
- * write requests are unlocked in dd_finish_request().
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
- if (!rq && blk_queue_is_zoned(hctx->queue) &&
- !list_empty(&dd->fifo_list[WRITE]))
- blk_mq_sched_mark_restart_hctx(hctx);
spin_unlock(&dd->lock);
return rq;
@@ -561,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio)
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing. This function is called for
* all requests, whether or not these requests complete successfully.
+ *
+ * For a zoned block device, __dd_dispatch_request() may have stopped
+ * dispatching requests if all the queued requests are write requests directed
+ * at zones that are already locked due to on-going write requests. To ensure
+ * write request dispatch progress in this case, mark the queue as needing a
+ * restart to ensure that the queue is run again after completion of the
+ * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -572,6 +569,8 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
+ if (!list_empty(&dd->fifo_list[WRITE]))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
From: Will Deacon <will(a)kernel.org>
Closing /dev/pts/ptmx removes the corresponding pty under /dev/pts/
without synchronizing against concurrent path walkers. This can lead to
'dcache_readdir()' tripping over a 'struct dentry' with a NULL 'd_inode'
field:
| BUG: kernel NULL pointer dereference, address: 0000000000000000
| #PF: supervisor read access in kernel mode
| #PF: error_code(0x0000) - not-present page
| PGD 0 P4D 0
| SMP PTI
| CPU: 9 PID: 179 Comm: ptmx Not tainted 5.4.0-rc1+ #5
| RIP: 0010:dcache_readdir+0xe1/0x150
| Code: 48 83 f8 01 74 a2 48 83 f8 02 74 eb 4d 8d a7 90 00 00 00 45 31 f6 eb 42 48 8b 43 30 48 8b 4d 08 48 89 ef 8b 53 24 48 8b 73 28 <44> 0f b7 08 4c 8b 55 00 4c 8b 40 40 66 41 c1 e9 0c 41 83 e1 0f e8
| RSP: 0018:ffffa7df8044be58 EFLAGS: 00010286
| RAX: 0000000000000000 RBX: ffff9511c78f3ec0 RCX: 0000000000000002
| RDX: 0000000000000001 RSI: ffff9511c78f3ef8 RDI: ffffa7df8044bed0
| RBP: ffffa7df8044bed0 R08: 0000000000000000 R09: 00000000004bc478
| R10: ffff9511c877c6a8 R11: ffff9511c8dde600 R12: ffff9511c878c460
| R13: ffff9511c878c3c0 R14: 0000000000000000 R15: ffff9511c9442cc0
| FS: 00007fc5ea2e1700(0000) GS:ffff9511ca280000(0000) knlGS:0000000000000000
| CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000000000 CR3: 0000000047d68002 CR4: 0000000000760ea0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| PKRU: 55555554
| Call Trace:
| iterate_dir+0x137/0x190
| ksys_getdents64+0x97/0x130
| ? iterate_dir+0x190/0x190
| __x64_sys_getdents64+0x11/0x20
| do_syscall_64+0x43/0x110
| entry_SYSCALL_64_after_hwframe+0x44/0xa9
In this case, one CPU is deleting the dentry and clearing the inode
pointer via:
devpts_pty_kill()
-> dput()
-> dentry_kill()
-> __dentry_kill()
-> dentry_unlink_inode()
whilst the other is traversing the directory and obtaining a reference
to the dentry being deleted via:
sys_getdents64()
-> iterate_dir()
-> dcache_readdir()
-> next_positive()
Prevent the race by acquiring the inode lock of the parent in
'devpts_pty_kill()' so that path walkers are held off until the dentry
has been completely torn down.
Will's fix didn't link to the commit it fixes so I tracked it down.
devpts_pty_kill() used to take inode_lock() before removing a pts
device. The inode_lock() got removed in
8ead9dd54716 ("devpts: more pty driver interface cleanups"). The
reasoning behind the removal seemed to be that the inode_lock() was only
needed because d_find_alias(inode) had to be called before that commit
to find the dentry that was supposed to be removed. Linus then changed
the pty driver to stash away the dentry and subsequently got rid of the
inode_lock(). However, it seems that the inode_lock() is needed to
protect against the race outlined above. So add it back.
Note that this bug had been brought up before, in November 2018
(cf. [1]). But a fix never got merged because a proper commit wasn't
sent. The issue came back up when Will and I talked about it at Kernel
Recipes in Paris. So here is a fix which prevents the issue. I very much
vote we get this merged asap, since as an unprivileged user I can do:
unshare -U --map-root --mount
mount -t devpts devpts
./<run_reproducer_below>
/* Reproducer */
Note that the reproducer will take a little while. Will reported usually
around 10s. For me it took a few minutes.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
static void *readdir_thread(void *arg)
{
DIR *d = arg;
struct dirent *dent;
for (;;) {
errno = 0;
dent = readdir(d);
if (!dent) {
if (errno)
perror("readdir");
break;
}
rewinddir(d);
}
return NULL;
}
int main(void)
{
DIR *d;
pthread_t t;
int ret = 0;
d = opendir("/dev/pts");
if (!d) {
ret = errno;
perror("opendir");
exit(EXIT_FAILURE);
}
ret = pthread_create(&t, NULL, readdir_thread, d);
if (ret) {
errno = ret;
perror("pthread_create");
exit(EXIT_FAILURE);
}
for (;;) {
int dfd;
int fd;
dfd = dirfd(d);
if (dfd < 0) {
perror("dirfd");
break;
}
fd = openat(dirfd(d), "ptmx", O_RDONLY | O_NONBLOCK | O_CLOEXEC);
if (fd < 0) {
perror("openat");
break;
}
close(fd);
}
pthread_join(t, NULL);
closedir(d);
exit(EXIT_SUCCESS);
}
/* References */
[1]: https://lore.kernel.org/r/20181109143744.GA12128@hc
Fixes: 8ead9dd54716 ("devpts: more pty driver interface cleanups")
Cc: <stable(a)vger.kernel.org>
Cc: Jan Glauber <jglauber(a)marvell.com>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Will Deacon <will(a)kernel.org>
Reviewed-by: Christian Brauner <christian.brauner(a)ubuntu.com>
[christian.brauner(a)ubuntu.com: dig into history and add context and reproducer to commit message]
Signed-off-by: Christian Brauner <christian.brauner(a)ubuntu.com>
---
fs/devpts/inode.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e5a766d33c..4b4546347aac 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -617,13 +617,18 @@ void *devpts_get_priv(struct dentry *dentry)
*/
void devpts_pty_kill(struct dentry *dentry)
{
- WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC);
+ struct super_block *sb = dentry->d_sb;
+ struct dentry *parent = sb->s_root;
+ WARN_ON_ONCE(sb->s_magic != DEVPTS_SUPER_MAGIC);
+
+ inode_lock(parent->d_inode);
dentry->d_fsdata = NULL;
drop_nlink(dentry->d_inode);
fsnotify_unlink(d_inode(dentry->d_parent), dentry);
d_drop(dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
+ inode_unlock(parent->d_inode);
}
static int __init init_devpts_fs(void)
--
2.23.0