From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit a7fed5c0431dbfa707037848830f980e0f93cfb3 ]
register_nmi_handler() has no sanity check for whether a handler has already been registered. Such an unintended double-add leads to list corruption and hard-to-diagnose problems during subsequent NMI handling.
Init the list head in the static NMI action struct and check it for being empty in register_nmi_handler().
[ bp: Fixups. ]
Reported-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/lkml/20220511234332.3654455-1-seanjc@google.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/include/asm/nmi.h |  1 +
 arch/x86/kernel/nmi.c      | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 1cb9c17a4cb4..5c5f1e56c404 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -47,6 +47,7 @@ struct nmiaction {
 #define register_nmi_handler(t, fn, fg, n, init...)	\
 ({							\
 	static struct nmiaction init fn##_na = {	\
+		.list = LIST_HEAD_INIT(fn##_na.list),	\
 		.handler = (fn),			\
 		.name = (n),				\
 		.flags = (fg),				\
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e73f7df362f5..cec0bfa3bc04 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 	struct nmi_desc *desc = nmi_to_desc(type);
 	unsigned long flags;
 
-	if (!action->handler)
+	if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
@@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 		list_add_rcu(&action->list, &desc->head);
 	else
 		list_add_tail_rcu(&action->list, &desc->head);
-
+
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler);
 void unregister_nmi_handler(unsigned int type, const char *name)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
-	struct nmiaction *n;
+	struct nmiaction *n, *found = NULL;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
@@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 			WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name);
 			list_del_rcu(&n->list);
+			found = n;
 			break;
 		}
 	}
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	synchronize_rcu();
+	if (found) {
+		synchronize_rcu();
+		INIT_LIST_HEAD(&found->list);
+	}
 }
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
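The guard works because of a list_head invariant: a node initialized with LIST_HEAD_INIT() points at itself, and list_empty() stays true exactly as long as the node is unlinked; unregister_nmi_handler() reinstates that state with INIT_LIST_HEAD(). A minimal user-space sketch of the same detection pattern (not kernel code; the list helpers below are simplified stand-ins):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* An unlinked node points at itself, mirroring the kernel invariant. */
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static struct list_head handlers = LIST_HEAD_INIT(handlers);
static struct list_head my_handler = LIST_HEAD_INIT(my_handler);

static int register_handler(struct list_head *node)
{
	if (!list_empty(node))	/* already linked: refuse the double-add */
		return -1;
	list_add_tail(node, &handlers);
	return 0;
}

int main(void)
{
	printf("first:  %d\n", register_handler(&my_handler));	/* 0  */
	printf("second: %d\n", register_handler(&my_handler));	/* -1 */
	return 0;
}

The second call is rejected instead of linking the node twice and corrupting the handler list.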
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[ Upstream commit 21673fcb2532dcd189905ff5a5389eb7dcd0e57a ]
The IRQ simulator uses irq_work to trigger an interrupt. Without the IRQ_WORK_HARD_IRQ flag, the irq_work will be performed in thread context on PREEMPT_RT. This causes locking errors later in handle_simple_irq(), which expects to be invoked with interrupts disabled.
Triggering individual interrupts in hardirq context should not lead to unexpectedly high latencies, since this is also what the hardware controller does. Also, it is used as a simulator so...
Use IRQ_WORK_INIT_HARD() to carry out the irq_work in hardirq context on PREEMPT_RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/YnuZBoEVMGwKkLm+@linutronix.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/irq/irq_sim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 0cd02efa3a74..dd76323ea3fd 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
 		goto err_free_bitmap;
 
 	work_ctx->irq_count = num_irqs;
-	init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+	work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
 
 	return work_ctx->domain;
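For comparison, a short sketch of the two initialization styles (hypothetical callback name; IRQ_WORK_INIT() is the plain counterpart of IRQ_WORK_INIT_HARD()):

#include <linux/irq_work.h>

static void my_cb(struct irq_work *work)
{
	/* handle the simulated interrupt */
}

/* May be deferred to thread context on PREEMPT_RT: */
static struct irq_work soft_work = IRQ_WORK_INIT(my_cb);

/* Stays in hard interrupt context, also on PREEMPT_RT: */
static struct irq_work hard_work = IRQ_WORK_INIT_HARD(my_cb);

/* Either one is queued the same way, e.g. irq_work_queue(&hard_work); */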
From: "Maciej W. Rozycki" macro@orcam.me.uk
[ Upstream commit 92067440f1311dfa4d77b57a9da6b3706f5da32e ]
The frequencies reported for clock sources are rounded down, which gives misleading figures, e.g.:
 I/O ASIC clock frequency 24999480Hz
 sched_clock: 32 bits at 24MHz, resolution 40ns, wraps every 85901132779ns
 MIPS counter frequency 59998512Hz
 sched_clock: 32 bits at 59MHz, resolution 16ns, wraps every 35792281591ns
Rounding to nearest is more accurate:
 I/O ASIC clock frequency 24999664Hz
 sched_clock: 32 bits at 25MHz, resolution 40ns, wraps every 85900499947ns
 MIPS counter frequency 59999728Hz
 sched_clock: 32 bits at 60MHz, resolution 16ns, wraps every 35791556599ns
Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204240055590.9383@angie.orcam.me....
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/time/sched_clock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b1b9b12899f5..ee07f3ac1e5b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,6 +8,7 @@
 #include <linux/jiffies.h>
 #include <linux/ktime.h>
 #include <linux/kernel.h>
+#include <linux/math.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
@@ -199,11 +200,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 
 	r = rate;
 	if (r >= 4000000) {
-		r /= 1000000;
+		r = DIV_ROUND_CLOSEST(r, 1000000);
 		r_unit = 'M';
 	} else {
 		if (r >= 1000) {
-			r /= 1000;
+			r = DIV_ROUND_CLOSEST(r, 1000);
 			r_unit = 'k';
 		} else {
 			r_unit = ' ';
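The difference is easy to replay in plain C. A self-contained sketch, assuming DIV_ROUND_CLOSEST() for unsigned operands reduces to adding half the divisor before dividing (the kernel macro lives in <linux/math.h>, hence the new include); the inputs are the "before" figures from the changelog:

#include <stdio.h>

/* User-space model of DIV_ROUND_CLOSEST() for unsigned operands. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	unsigned long rates[] = { 24999480UL, 59998512UL };

	for (int i = 0; i < 2; i++)
		printf("%luHz: truncated %luMHz, rounded %luMHz\n",
		       rates[i], rates[i] / 1000000UL,
		       div_round_closest(rates[i], 1000000UL));
	return 0;
}

/* Output:
 * 24999480Hz: truncated 24MHz, rounded 25MHz
 * 59998512Hz: truncated 59MHz, rounded 60MHz
 */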
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit d664e399128bd78b905ff480917e2c2d4949e101 ]
A W=1 build emits more than a dozen missing-prototype warnings related to the scheduler and scheduler-specific include files.
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220413133024.249118058@linutronix.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 include/linux/sched.h        | 2 ++
 kernel/sched/build_policy.c  | 2 ++
 kernel/sched/build_utility.c | 1 +
 kernel/sched/core.c          | 3 +++
 kernel/sched/deadline.c      | 2 --
 kernel/sched/fair.c          | 1 +
 kernel/sched/sched.h         | 8 ++------
 kernel/sched/smp.h           | 6 ++++++
 kernel/stop_machine.c        | 2 --
 9 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8911b1f35aa..74947048e3ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2362,4 +2362,6 @@ static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
 #endif
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index e0104b45029a..d9dc9ab3773f 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -15,6 +15,7 @@
 /* Headers: */
 #include <linux/sched/clock.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
 #include <linux/sched/posix-timers.h>
 #include <linux/sched/rt.h>
 
@@ -31,6 +32,7 @@
 #include <uapi/linux/sched/types.h>
 
 #include "sched.h"
+#include "smp.h"
 
 #include "autogroup.h"
 #include "stats.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index eec0849b2aae..99bdd96f454f 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -14,6 +14,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/rseq_api.h>
 #include <linux/sched/task_stack.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d58c0389eb23..ca71ebe2cd1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,7 +26,10 @@
 #include <linux/topology.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/mm.h>
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..6ae423627a7a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 	return (dl_se->runtime <= 0);
 }
 
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
 /*
  * This function implements the GRUB accounting rule:
  * according to the GRUB reclaiming algorithm, the runtime is
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a68482d66535..4edd47307cce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
 #include <linux/sched/cond_resched.h>
 #include <linux/sched/cputime.h>
 #include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
 
 #include <linux/cpuidle.h>
 #include <linux/interrupt.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8dccb34eb190..137d5c7ea7b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
 #endif
 
 extern int sched_update_scaling(void);
-
-extern void flush_smp_call_function_from_idle(void);
-
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+#endif /* CONFIG_SMP */
 
 #include "stats.h"
 
@@ -2309,6 +2304,7 @@ extern void resched_cpu(int cpu);
 
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..5719bf9280e9 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -7,3 +7,9 @@ extern void sched_ttwu_pending(void *arg);
 
 extern void send_call_function_single_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_from_idle(void);
+#else
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cbc30271ea4d..6da7b91af353 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
 	kthread_park(stopper->thread);
 }
 
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
 static void cpu_stop_create(unsigned int cpu)
 {
	sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
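The warning class being silenced is gcc's -Wmissing-prototypes: a global function defined without a prototype in scope. The cure used throughout the patch is a single declaration in a header shared by the definition and all users, rather than ad-hoc extern declarations in consumers (as previously in deadline.c and stop_machine.c). A minimal sketch with hypothetical files lib.h/lib.c/user.c:

/* lib.h — the shared prototypes */
int helper(int x);
int use_it(void);

/* lib.c — build with: gcc -Wmissing-prototypes -c lib.c
 * Omitting the include below triggers:
 *   warning: no previous prototype for 'helper' [-Wmissing-prototypes]
 */
#include "lib.h"

int helper(int x)
{
	return 2 * x;
}

/* user.c — also includes lib.h; no stray "extern int helper(int);" */
#include "lib.h"

int use_it(void)
{
	return helper(21);
}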
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[ Upstream commit 75d8cce128c516fe6cf4b8683e8fe1a59e919902 ]
irq_poll_cpu_dead() pulls the blk_cpu_iopoll backlog from the dead CPU and raises the POLL softirq with __raise_softirq_irqoff() on the CPU it is running on. That just sets the bit in the pending softirq mask.
This means the handling of the softirq is delayed until the next interrupt or a local_bh_disable/enable() pair. As a consequence the CPU on which this code runs can reach idle with the POLL softirq pending, which triggers a warning in the NOHZ idle code.
Add a local_bh_disable/enable() pair around the interrupts-disabled section in irq_poll_cpu_dead(). local_bh_enable() will then handle the pending softirq.
[tglx: Massaged changelog and comment]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/87k0bxgl27.ffs@tglx
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 lib/irq_poll.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
index 2f17b488d58e..2d5329a42105 100644
--- a/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@ -188,14 +188,18 @@ EXPORT_SYMBOL(irq_poll_init);
 static int irq_poll_cpu_dead(unsigned int cpu)
 {
 	/*
-	 * If a CPU goes away, splice its entries to the current CPU
-	 * and trigger a run of the softirq
+	 * If a CPU goes away, splice its entries to the current CPU and
+	 * set the POLL softirq bit. The local_bh_disable()/enable() pair
+	 * ensures that it is handled. Otherwise the current CPU could
+	 * reach idle with the POLL softirq pending.
 	 */
+	local_bh_disable();
 	local_irq_disable();
 	list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
 			 this_cpu_ptr(&blk_cpu_iopoll));
 	__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
 	local_irq_enable();
+	local_bh_enable();
 
 	return 0;
 }
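The shape of the fix generalizes to any code that raises a softirq from task context: __raise_softirq_irqoff() only sets the pending bit, so the caller must create a processing point. A sketch of the pattern (illustrative function; placeholder comment where the backlog splice goes):

#include <linux/interrupt.h>

/* Called on a surviving CPU when another CPU dies (sketch). */
static int example_cpu_dead(unsigned int cpu)
{
	local_bh_disable();
	local_irq_disable();
	/* ... splice the dead CPU's per-CPU backlog over here ... */
	__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);	/* only marks it pending */
	local_irq_enable();
	local_bh_enable();	/* processes the pending softirq right here */
	return 0;
}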
From: Marc Zyngier <maz@kernel.org>
[ Upstream commit d802057c7c553ad426520a053da9f9fe08e2c35a ]
When booting with maxcpus=<small number>, interrupt controllers such as the GICv3 ITS may not be able to satisfy the affinity of some managed interrupts, as some of the HW resources are simply not available.
The same thing happens when loading a driver using managed interrupts while CPUs are offline.
In order to deal with this, do not try to activate such an interrupt if there is no online CPU capable of handling it. Instead, place it in shutdown state. Once a capable CPU shows up, it will be activated.
Reported-by: John Garry <john.garry@huawei.com>
Reported-by: David Decotigny <ddecotig@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/20220405185040.206297-2-maz@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/irq/msi.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2bdfce5edafd..a9ee535293eb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 			irqd_clr_can_reserve(irqd);
 		if (vflags & VIRQ_NOMASK_QUIRK)
 			irqd_set_msi_nomask_quirk(irqd);
+
+		/*
+		 * If the interrupt is managed but no CPU is available to
+		 * service it, shut it down until better times. Note that
+		 * we only do this on the !RESERVE path as x86 (the only
+		 * architecture using this flag) deals with this in a
+		 * different way by using a catch-all vector.
+		 */
+		if ((vflags & VIRQ_ACTIVATE) &&
+		    irqd_affinity_is_managed(irqd) &&
+		    !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+					cpu_online_mask)) {
+			irqd_set_managed_shutdown(irqd);
+			return 0;
+		}
 	}
 
 	if (!(vflags & VIRQ_ACTIVATE))
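Distilled into a hypothetical predicate (illustrative only; the accessor names are the ones used in the patch), the new rule reads:

#include <linux/cpumask.h>
#include <linux/irq.h>

static bool msi_can_activate(struct irq_data *irqd)
{
	/* Unmanaged interrupts can always be pointed at an online CPU. */
	if (!irqd_affinity_is_managed(irqd))
		return true;

	/*
	 * A managed interrupt is tied to a fixed CPU set. If none of
	 * those CPUs is online, activation must wait for one to appear.
	 */
	return cpumask_intersects(irq_data_get_affinity_mask(irqd),
				  cpu_online_mask);
}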
From: "Maciej W. Rozycki" macro@orcam.me.uk
[ Upstream commit 5d64089aa4a5bd3d7e00e3d6ddf4943dd34627b3 ]
Verify that the PCI IRQ Routing Table header as well as individual slot entries are all wholly contained within the BIOS memory area. Do not even call the checksum calculator if the header would overrun the area and then bail out early if any slot would.
Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301735510.22465@angie.orcam.me...
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/pci/irq.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 97b63e35e152..13513003303e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -68,7 +68,8 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
  * and perform checksum verification.
  */
 
-static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr,
+								 u8 *limit)
 {
 	struct irq_routing_table *rt;
 	int i;
@@ -78,7 +79,8 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
 	if (rt->signature != PIRQ_SIGNATURE ||
 	    rt->version != PIRQ_VERSION ||
 	    rt->size % 16 ||
-	    rt->size < sizeof(struct irq_routing_table))
+	    rt->size < sizeof(struct irq_routing_table) ||
+	    (limit && rt->size > limit - addr))
 		return NULL;
 	sum = 0;
 	for (i = 0; i < rt->size; i++)
@@ -99,17 +101,22 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
 
 static struct irq_routing_table * __init pirq_find_routing_table(void)
 {
+	u8 * const bios_start = (u8 *)__va(0xf0000);
+	u8 * const bios_end = (u8 *)__va(0x100000);
 	u8 *addr;
 	struct irq_routing_table *rt;
 
 	if (pirq_table_addr) {
-		rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
+		rt = pirq_check_routing_table((u8 *)__va(pirq_table_addr),
+					      NULL);
 		if (rt)
 			return rt;
 		printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
 	}
-	for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
-		rt = pirq_check_routing_table(addr);
+	for (addr = bios_start;
+	     addr < bios_end - sizeof(struct irq_routing_table);
+	     addr += 16) {
+		rt = pirq_check_routing_table(addr, bios_end);
 		if (rt)
 			return rt;
 	}
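The validation, including both new bounds checks, can be modeled in user space. A sketch with a simplified header layout (the real struct irq_routing_table carries more fields; the "$PIR" signature, version, 16-byte size granularity, and whole-table byte checksum follow the kernel code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct pirq_header {
	char     signature[4];	/* "$PIR" */
	uint16_t version;	/* 0x0100 */
	uint16_t size;		/* total table size, slot entries included */
	/* ... remaining header fields and slot entries follow ... */
};

static const struct pirq_header *check_table(const uint8_t *addr,
					     const uint8_t *limit)
{
	const struct pirq_header *rt = (const void *)addr;
	uint8_t sum = 0;

	/* New check 1: the header itself must fit within the area. */
	if (limit && (size_t)(limit - addr) < sizeof(*rt))
		return NULL;
	if (memcmp(rt->signature, "$PIR", 4) ||
	    rt->version != 0x0100 ||
	    rt->size % 16 ||
	    rt->size < sizeof(*rt) ||
	    /* New check 2: all slot entries must fit as well. */
	    (limit && rt->size > (size_t)(limit - addr)))
		return NULL;
	for (size_t i = 0; i < rt->size; i++)	/* checksum only when safe */
		sum += addr[i];
	return sum == 0 ? rt : NULL;
}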