From: Raghavendra K T <raghavendra.kt@amd.com>
[ Upstream commit 84db47ca7146d7bd00eb5cf2b93989a971c84650 ]
Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic")
NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the task had previously accessed the VMA. However, unconditional scans of VMAs are allowed during the initial phase of VMA creation, until the process's mm numa_scan_seq reaches 2, even if the current task has not accessed the VMA.
Rationale:
 - Without the initial scan, a subsequent PTE update may never happen.
 - Give all the VMAs a fair opportunity to be scanned, and subsequently understand the access pattern of all the VMAs.
But there is a corner case: if a VMA is created after some time, the process's mm numa_scan_seq could already be greater than 2.
For example, the values of mm numa_scan_seq at VMA creation time while briefly running the mmtests autonuma benchmark look like:

 start_seq=0 : 459
 start_seq=2 : 138
 start_seq=3 : 144
 start_seq=4 : 8
 start_seq=8 : 1
 start_seq=9 : 1

This results in no unconditional PTE updates for those VMAs created after some time.
Fix:
 - Note down the initial value of mm numa_scan_seq in the per-VMA start_seq.
 - Allow unconditional scans until start_seq + 2 (sketched below).
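In effect, the check that gates unconditional PTE updates becomes relative to the sequence number recorded when the scanner first saw the VMA. A minimal sketch of the idea (hypothetical helper name; the real change is the one-line condition in vma_is_accessed() in the diff below):

	/* Sketch: is this VMA still within its initial, unconditional scan window? */
	static bool vma_in_initial_scan_window(struct mm_struct *mm,
					       struct vm_area_struct *vma)
	{
		int start = vma->numab_state->start_scan_seq;

		return (READ_ONCE(mm->numa_scan_seq) - start) < 2;
	}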
Result:

SUT: AMD EPYC Milan with 2 NUMA nodes, 256 CPUs.
base kernel: upstream 6.6-rc6 with Mel's patches [1] applied.
kernbench
==========
                              base                patched       %gain
Amean     elsp-128          165.09 (  0.00%)     164.78 *   0.19%*

Duration User       41404.28     41375.08
Duration System      9862.22      9768.48
Duration Elapsed      519.87       518.72

Ops NUMA PTE updates        1041416.00      831536.00
Ops NUMA hint faults         263296.00      220966.00
Ops NUMA pages migrated      258021.00      212769.00
Ops AutoNUMA cost               1328.67        1114.69
autonumabench
NUMA01_THREADLOCAL
==================
Amean     elsp-NUMA01_THREADLOCAL      81.79 (0.00%)      67.74 * 17.18%*

Duration User       54832.73     47379.67
Duration System        75.00       185.75
Duration Elapsed      576.72       476.09

Ops NUMA PTE updates         394429.00    11121044.00
Ops NUMA hint faults           1001.00     8906404.00
Ops NUMA pages migrated         288.00     2998694.00
Ops AutoNUMA cost                 7.77       44666.84
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.169781669...
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 include/linux/mm_types.h | 3 +++
 kernel/sched/fair.c      | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..950df415d7de 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -600,6 +600,9 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active[2];
 
+	/* MM scan sequence ID when scan first started after VMA creation */
+	int start_scan_seq;
+
 	/*
 	 * MM scan sequence ID when the VMA was last completely scanned.
 	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..44b5262b6657 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3164,7 +3164,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	 * This is also done to avoid any side effect of task scanning
 	 * amplifying the unfairness of disjoint set of VMAs' access.
 	 */
-	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
 	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
@@ -3307,6 +3307,8 @@ static void task_numa_work(struct callback_head *work)
 			if (!vma->numab_state)
 				continue;
 
+			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
 			vma->numab_state->next_scan = now +
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
From: Greg KH <gregkh@linuxfoundation.org>
[ Upstream commit 652ffc2104ec1f69dd4a46313888c33527145ccf ]
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/2023061204-decal-flyable-6090@gregkh
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/events/core.c | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efd0d7775e7..fbecba5b00b1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11425,9 +11425,32 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 static struct attribute *pmu_dev_attrs[] = {
 	&dev_attr_type.attr,
 	&dev_attr_perf_event_mux_interval_ms.attr,
+	&dev_attr_nr_addr_filters.attr,
+	NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	if (n == 2 && !pmu->nr_addr_filters)
+		return 0;
+
+	return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+	.is_visible = pmu_dev_is_visible,
+	.attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+	&pmu_dev_attr_group,
 	NULL,
 };
-ATTRIBUTE_GROUPS(pmu_dev);
 
 static int pmu_bus_running;
 static struct bus_type pmu_bus = {
@@ -11464,18 +11487,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto free_dev;
 
-	/* For PMUs with address filters, throw in an extra attribute: */
-	if (pmu->nr_addr_filters)
-		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
-	if (ret)
-		goto del_dev;
-
-	if (pmu->attr_update)
+	if (pmu->attr_update) {
 		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
-	if (ret)
-		goto del_dev;
+		if (ret)
+			goto del_dev;
+	}
 
 out:
 	return ret;
From: Andrzej Hajda <andrzej.hajda@intel.com>
[ Upstream commit 9bb6362652f3f4d74a87d572a91ee1b38e673ef6 ]
After the hash bucket lock is released, the tracking object can be modified or freed by a concurrent thread. Using it in such a case is error prone, even for printing the object state:

1. T1 tries to deactivate a destroyed object; debugobjects detects it, and the hash bucket lock is released.

2. T2 preempts T1 and frees the tracking object.

3. The freed tracking object is reallocated and initialized for a different kernel object that is to be tracked.

4. T1 resumes and reports an error for the wrong kernel object.

Create a local copy of the tracking object before releasing the hash bucket lock, and use the local copy for reporting and fixups, to prevent this.
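Each debug_object_*() helper then follows the same shape; a simplified sketch (illustrative only, not the exact code of any single helper in the diff):

	struct debug_obj *obj, o;

	raw_spin_lock_irqsave(&db->lock, flags);
	obj = lookup_object(addr, db);
	if (obj) {
		/* ... check/transition obj->state under the bucket lock ... */
		o = *obj;	/* snapshot while the lock is still held */
	}
	raw_spin_unlock_irqrestore(&db->lock, flags);

	/* Report and fix up from the stable local copy, never from *obj. */
	debug_print_object(&o, "deactivate");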
Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231025-debugobjects_fix-v3-1-2bc3bf7084c2@intel....
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 lib/debugobjects.c | 200 ++++++++++++++++++---------------------
 1 file changed, 78 insertions(+), 122 deletions(-)
diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2a8e9d63fbe3..fb12a9bacd2f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -620,9 +620,8 @@ static void debug_objects_fill_pool(void) static void __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags;
debug_objects_fill_pool(); @@ -643,24 +642,18 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_INIT; - break; - - case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "init"); - debug_object_fixup(descr->fixup_init, addr, state); - return; - - case ODEBUG_STATE_DESTROYED: raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "init"); return; default: break; }
+ o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(&o, "init"); + + if (o.state == ODEBUG_STATE_ACTIVE) + debug_object_fixup(descr->fixup_init, addr, o.state); }
/** @@ -701,11 +694,9 @@ EXPORT_SYMBOL_GPL(debug_object_init_on_stack); int debug_object_activate(void *addr, const struct debug_obj_descr *descr) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; - enum debug_obj_state state; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - int ret;
if (!debug_objects_enabled) return 0; @@ -717,49 +708,38 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object_or_alloc(addr, db, descr, false, true); - if (likely(!IS_ERR_OR_NULL(obj))) { - bool print_object = false; - + if (unlikely(!obj)) { + raw_spin_unlock_irqrestore(&db->lock, flags); + debug_objects_oom(); + return 0; + } else if (likely(!IS_ERR(obj))) { switch (obj->state) { - case ODEBUG_STATE_INIT: - case ODEBUG_STATE_INACTIVE: - obj->state = ODEBUG_STATE_ACTIVE; - ret = 0; - break; - case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, state); - return ret ? 0 : -EINVAL; - case ODEBUG_STATE_DESTROYED: - print_object = true; - ret = -EINVAL; + o = *obj; break; + case ODEBUG_STATE_INIT: + case ODEBUG_STATE_INACTIVE: + obj->state = ODEBUG_STATE_ACTIVE; + fallthrough; default: - ret = 0; - break; + raw_spin_unlock_irqrestore(&db->lock, flags); + return 0; } - raw_spin_unlock_irqrestore(&db->lock, flags); - if (print_object) - debug_print_object(obj, "activate"); - return ret; }
raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(&o, "activate");
- /* If NULL the allocation has hit OOM */ - if (!obj) { - debug_objects_oom(); - return 0; + switch (o.state) { + case ODEBUG_STATE_ACTIVE: + case ODEBUG_STATE_NOTAVAILABLE: + if (debug_object_fixup(descr->fixup_activate, addr, o.state)) + return 0; + fallthrough; + default: + return -EINVAL; } - - /* Object is neither static nor tracked. It's not initialized */ - debug_print_object(&o, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, ODEBUG_STATE_NOTAVAILABLE); - return ret ? 0 : -EINVAL; } EXPORT_SYMBOL_GPL(debug_object_activate);
@@ -770,10 +750,10 @@ EXPORT_SYMBOL_GPL(debug_object_activate); */ void debug_object_deactivate(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - bool print_object = false;
if (!debug_objects_enabled) return; @@ -785,33 +765,24 @@ void debug_object_deactivate(void *addr, const struct debug_obj_descr *descr) obj = lookup_object(addr, db); if (obj) { switch (obj->state) { + case ODEBUG_STATE_DESTROYED: + break; case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: case ODEBUG_STATE_ACTIVE: - if (!obj->astate) - obj->state = ODEBUG_STATE_INACTIVE; - else - print_object = true; - break; - - case ODEBUG_STATE_DESTROYED: - print_object = true; - break; + if (obj->astate) + break; + obj->state = ODEBUG_STATE_INACTIVE; + fallthrough; default: - break; + raw_spin_unlock_irqrestore(&db->lock, flags); + return; } + o = *obj; }
raw_spin_unlock_irqrestore(&db->lock, flags); - if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - debug_print_object(&o, "deactivate"); - } else if (print_object) { - debug_print_object(obj, "deactivate"); - } + debug_print_object(&o, "deactivate"); } EXPORT_SYMBOL_GPL(debug_object_deactivate);
@@ -822,11 +793,9 @@ EXPORT_SYMBOL_GPL(debug_object_deactivate); */ void debug_object_destroy(void *addr, const struct debug_obj_descr *descr) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags; - bool print_object = false;
if (!debug_objects_enabled) return; @@ -836,32 +805,31 @@ void debug_object_destroy(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object(addr, db); - if (!obj) - goto out_unlock; + if (!obj) { + raw_spin_unlock_irqrestore(&db->lock, flags); + return; + }
switch (obj->state) { + case ODEBUG_STATE_ACTIVE: + case ODEBUG_STATE_DESTROYED: + break; case ODEBUG_STATE_NONE: case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_DESTROYED; - break; - case ODEBUG_STATE_ACTIVE: - state = obj->state; + fallthrough; + default: raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "destroy"); - debug_object_fixup(descr->fixup_destroy, addr, state); return; - - case ODEBUG_STATE_DESTROYED: - print_object = true; - break; - default: - break; } -out_unlock: + + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); - if (print_object) - debug_print_object(obj, "destroy"); + debug_print_object(&o, "destroy"); + + if (o.state == ODEBUG_STATE_ACTIVE) + debug_object_fixup(descr->fixup_destroy, addr, o.state); } EXPORT_SYMBOL_GPL(debug_object_destroy);
@@ -872,9 +840,8 @@ EXPORT_SYMBOL_GPL(debug_object_destroy); */ void debug_object_free(void *addr, const struct debug_obj_descr *descr) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags;
if (!debug_objects_enabled) @@ -885,24 +852,26 @@ void debug_object_free(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object(addr, db); - if (!obj) - goto out_unlock; + if (!obj) { + raw_spin_unlock_irqrestore(&db->lock, flags); + return; + }
switch (obj->state) { case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "free"); - debug_object_fixup(descr->fixup_free, addr, state); - return; + break; default: hlist_del(&obj->node); raw_spin_unlock_irqrestore(&db->lock, flags); free_object(obj); return; } -out_unlock: + + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(&o, "free"); + + debug_object_fixup(descr->fixup_free, addr, o.state); } EXPORT_SYMBOL_GPL(debug_object_free);
@@ -954,10 +923,10 @@ void debug_object_active_state(void *addr, const struct debug_obj_descr *descr, unsigned int expect, unsigned int next) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - bool print_object = false;
if (!debug_objects_enabled) return; @@ -970,28 +939,19 @@ debug_object_active_state(void *addr, const struct debug_obj_descr *descr, if (obj) { switch (obj->state) { case ODEBUG_STATE_ACTIVE: - if (obj->astate == expect) - obj->astate = next; - else - print_object = true; - break; - + if (obj->astate != expect) + break; + obj->astate = next; + raw_spin_unlock_irqrestore(&db->lock, flags); + return; default: - print_object = true; break; } + o = *obj; }
raw_spin_unlock_irqrestore(&db->lock, flags); - if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - debug_print_object(&o, "active_state"); - } else if (print_object) { - debug_print_object(obj, "active_state"); - } + debug_print_object(&o, "active_state"); } EXPORT_SYMBOL_GPL(debug_object_active_state);
@@ -999,12 +959,10 @@ EXPORT_SYMBOL_GPL(debug_object_active_state); static void __debug_check_no_obj_freed(const void *address, unsigned long size) { unsigned long flags, oaddr, saddr, eaddr, paddr, chunks; - const struct debug_obj_descr *descr; - enum debug_obj_state state; + int cnt, objs_checked = 0; + struct debug_obj *obj, o; struct debug_bucket *db; struct hlist_node *tmp; - struct debug_obj *obj; - int cnt, objs_checked = 0;
saddr = (unsigned long) address; eaddr = saddr + size; @@ -1026,12 +984,10 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size)
switch (obj->state) { case ODEBUG_STATE_ACTIVE: - descr = obj->descr; - state = obj->state; + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "free"); - debug_object_fixup(descr->fixup_free, - (void *) oaddr, state); + debug_print_object(&o, "free"); + debug_object_fixup(o.descr->fixup_free, (void *)oaddr, o.state); goto repeat; default: hlist_del(&obj->node);
From: Vincent Guittot <vincent.guittot@linaro.org>
[ Upstream commit 9942cb22ea458c34fa17b73d143ea32d4df1caca ]
Create a new method to get a unique and fixed max frequency. Currently, cpuinfo.max_freq or the highest (or last) state of the performance domain is used as the max frequency when computing the frequency for a level of utilization, but:
- cpuinfo.max_freq can change at runtime; boost is one example of such a change.
- cpuinfo.max_freq and last item of the PD can be different leading to different results between cpufreq and energy model.
We need to save the reference frequency that was used when computing the CPUs' capacity, and use this fixed and coherent value to convert between frequency and CPU capacity.

In fact, we already save the frequency that was used when computing the capacity of each CPU. Extend the precision to save the value in kHz instead of MHz, and modify the type to be aligned with the other variables used when converting frequency to capacity and the other way around.
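For reference, a rough sketch of how the saved value is consumed later in this series (fragment only; per-CPU capacity_freq_ref is introduced below, map_util_freq() is the existing helper from <linux/sched/cpufreq.h>):

	unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
	unsigned long freq_ref  = per_cpu(capacity_freq_ref, cpu);	/* kHz */

	/* utilization in [0 .. scale_cpu]  ->  target frequency in kHz */
	freq = map_util_freq(util, freq_ref, scale_cpu);	/* ~ freq_ref * util / scale_cpu */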
[ mingo: Minor edits. ]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/arm/include/asm/topology.h   |  1 +
 arch/arm64/include/asm/topology.h |  1 +
 arch/riscv/include/asm/topology.h |  1 +
 drivers/base/arch_topology.c      | 29 ++++++++++++++---------------
 include/linux/arch_topology.h     |  7 +++++++
 include/linux/sched/topology.h    |  8 ++++++++
 6 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index c7d2510e5a78..853c4f81ba4a 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -13,6 +13,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #endif
/* Replace task scheduler's default cpu-invariant accounting */ diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 9fab663dd2de..a323b109b9c4 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -23,6 +23,7 @@ void update_freq_counters_refs(void); #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref
#ifdef CONFIG_ACPI_CPPC_LIB #define arch_init_invariance_cppc topology_init_cpu_capacity_cppc diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h index e316ab3b77f3..61183688bdd5 100644 --- a/arch/riscv/include/asm/topology.h +++ b/arch/riscv/include/asm/topology.h @@ -9,6 +9,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref
/* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index b741b5ba82bd..0c9ae5b157b1 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -19,6 +19,7 @@ #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/units.h>
#define CREATE_TRACE_POINTS #include <trace/events/thermal_pressure.h> @@ -26,7 +27,8 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; -static DEFINE_PER_CPU(u32, freq_factor) = 1; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);
static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -170,9 +172,9 @@ DEFINE_PER_CPU(unsigned long, thermal_pressure); * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to * thermal capping. It might be also a boost frequency value, which is bigger - * than the internal 'freq_factor' max frequency. In such case the pressure - * value should simply be removed, since this is an indication that there is - * no thermal throttling. The @capped_freq must be provided in kHz. + * than the internal 'capacity_freq_ref' max frequency. In such case the + * pressure value should simply be removed, since this is an indication that + * there is no thermal throttling. The @capped_freq must be provided in kHz. */ void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq) @@ -183,10 +185,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
cpu = cpumask_first(cpus); max_capacity = arch_scale_cpu_capacity(cpu); - max_freq = per_cpu(freq_factor, cpu); - - /* Convert to MHz scale which is used in 'freq_factor' */ - capped_freq /= 1000; + max_freq = arch_scale_freq_ref(cpu);
/* * Handle properly the boost frequencies, which should simply clean @@ -279,13 +278,13 @@ void topology_normalize_cpu_scale(void)
capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity_scale = max(capacity, capacity_scale); }
pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); @@ -321,15 +320,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) cpu_node, raw_capacity[cpu]);
/* - * Update freq_factor for calculating early boot cpu capacities. + * Update capacity_freq_ref for calculating early boot CPU capacities. * For non-clk CPU DVFS mechanism, there's no way to get the * frequency value now, assuming they are running at the same - * frequency (by keeping the initial freq_factor value). + * frequency (by keeping the initial capacity_freq_ref value). */ cpu_clk = of_clk_get(cpu_node, 0); if (!PTR_ERR_OR_ZERO(cpu_clk)) { - per_cpu(freq_factor, cpu) = - clk_get_rate(cpu_clk) / 1000; + per_cpu(capacity_freq_ref, cpu) = + clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); } } else { @@ -411,7 +410,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
for_each_cpu(cpu, policy->related_cpus) - per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000; + per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a07b510e7dc5..32c24ff4f2a8 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu)
void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
+DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale);
static inline unsigned long topology_get_freq_scale(int cpu) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index de545ba85218..a6e04b4a21d7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif
+#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p));
From: Vincent Guittot <vincent.guittot@linaro.org>
[ Upstream commit 599457ba15403037b489fe536266a3d5f9efaed7 ]
cpuinfo.max_freq can change at runtime, because of boost for example. This implies that the value could be different from the frequency that was used to compute the capacity of a CPU.
The new arch_scale_freq_ref() returns a fixed and coherent frequency that can be used to compute the capacity for a given frequency.
[ Also fix an arch_set_freq_scale() newline style wart in <linux/cpufreq.h>. ]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/cpufreq/cpufreq.c | 4 ++--
 include/linux/cpufreq.h   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 934d35f570b7..44db4f59c4cc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -454,7 +454,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy,
 
 	arch_set_freq_scale(policy->related_cpus,
 			    policy->cur,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));
 
 	spin_lock(&policy->transition_lock);
 	policy->transition_ongoing = false;
@@ -2174,7 +2174,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 
 	policy->cur = freq;
 	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));
 	cpufreq_stats_record_transition(policy, freq);
 
 	if (trace_cpu_frequency_enabled()) {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1c5ca92a0555..afda5f24d3dd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus,
 {
 }
 #endif
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
From: Vincent Guittot <vincent.guittot@linaro.org>
[ Upstream commit b3edde44e5d4504c23a176819865cd603fd16d6c ]
cpuinfo.max_freq can change at runtime, because of boost for example. This implies that the value could be different from the one that was used when computing the capacity of a CPU.
The new arch_scale_freq_ref() returns a fixed and coherent reference frequency that can be used when computing a frequency based on utilization.
Use arch_scale_freq_ref() when available, and fall back to the policy otherwise.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20231211104855.558096-4-vincent.guittot@linaro.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/sched/cpufreq_schedutil.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5888176354e2..11422c054c84 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -114,6 +114,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy)
 	}
 }
 
+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+	unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+	if (freq)
+		return freq;
+
+	if (arch_scale_freq_invariant())
+		return policy->cpuinfo.max_freq;
+
+	return policy->cur;
+}
+
 /**
  * get_next_freq - Compute a new frequency for a given cpufreq policy.
  * @sg_policy: schedutil policy object to compute the new frequency for.
@@ -140,9 +162,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 				  unsigned long util, unsigned long max)
 {
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned int freq = arch_scale_freq_invariant() ?
-				policy->cpuinfo.max_freq : policy->cur;
+	unsigned int freq;
 
+	freq = get_capacity_ref_freq(policy);
 	util = map_util_perf(util);
 	freq = map_util_freq(util, freq, max);
From: Vincent Guittot <vincent.guittot@linaro.org>
[ Upstream commit 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 ]
The last item of a performance domain is not always the performance point that was used to compute the CPU's capacity. This can lead to a different target frequency compared with other parts of the system, like schedutil, and would result in a wrong energy estimation.
A new arch_scale_freq_ref() is available to return a fixed and coherent reference frequency that can be used when computing the CPU's frequency for a level of utilization. Use this function to get this reference frequency.
The energy model is never used without arch_scale_freq_ref() being defined, but it can still be compiled in such a configuration; a default arch_scale_freq_ref() returning 0 is defined for that case.
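For context, the default that the earlier patch in this series added to <linux/sched/topology.h> is simply:

	#ifndef arch_scale_freq_ref
	static __always_inline
	unsigned int arch_scale_freq_ref(int cpu)
	{
		return 0;
	}
	#endif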
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 include/linux/energy_model.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index b9caa01dfac4..c19e7effe764 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util,
 				unsigned long allowed_cpu_cap)
 {
-	unsigned long freq, scale_cpu;
+	unsigned long freq, ref_freq, scale_cpu;
 	struct em_perf_state *ps;
 	int cpu;
 
@@ -241,11 +241,11 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	ps = &pd->table[pd->nr_perf_states - 1];
+	ref_freq = arch_scale_freq_ref(cpu);
 
 	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
-	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
+	freq = map_util_freq(max_util, ref_freq, scale_cpu);
 
 	/*
 	 * Find the lowest performance state of the Energy Model above the
From: Vincent Guittot <vincent.guittot@linaro.org>
[ Upstream commit f60a631ab9ed5df15e446269ea515f2b8948ba0c ]
When a CPU is taken offline, the contribution of its cfs_rqs to task_groups' load may remain and will negatively impact the calculation of the share of the online CPUs.
To fix this bug, clear the contribution of an offlining CPU to task groups' load and skip its contribution while it is inactive.
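To see why a leftover contribution skews things, note that a task group's per-CPU weight is computed, very roughly, as (simplified; see calc_group_shares() in kernel/sched/fair.c for the exact expression):

	/* simplified: stale grq load from an offline CPU stays in tg->load_avg */
	se->load.weight ~= tg->shares * grq->load_avg / tg->load_avg;

so an offline CPU's stale contribution inflates tg->load_avg and shrinks the weight of every online CPU in that group, which is the asymmetry visible in the reproducer below.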
Here's the reproducer of the anomaly, by Imran Khan:
"So far I have encountered only one rather lengthy way of reproducing this issue, which is as follows:
1. Take a KVM guest (booted with 4 CPUs and can be scaled up to 124 CPUs) and create 2 custom cgroups: /sys/fs/cgroup/cpu/test_group_1 and /sys/fs/cgroup/cpu/test_group_2
2. Assign a CPU intensive workload to each of these cgroups and start the workload.
For my tests I am using following app:
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	unsigned long count, i, val;

	if (argc != 2) {
		printf("usage: ./a.out <number of random nums to generate> \n");
		return 0;
	}

	count = strtoul(argv[1], NULL, 10);

	printf("Generating %lu random numbers \n", count);
	for (i = 0; i < count; i++) {
		val = rand();
		val = val % 2;
		//usleep(1);
	}
	printf("Generated %lu random numbers \n", count);
	return 0;
}
Also, since the system is booted with 4 CPUs, in order to completely load the system I am also launching 4 instances of the same test app under:
/sys/fs/cgroup/cpu/
3. We can see that both of the cgroups get similar CPU time:
# systemd-cgtop --depth 1
Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659       -     5.5G        -        -
/system.slice                   -       -     5.7G        -        -
/test_group_1                   4       -        -        -        -
/test_group_2                   3       -        -        -        -
/user.slice                    31       -    56.5M        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659   394.6     5.5G        -        -
/test_group_2                   3    65.7        -        -        -
/user.slice                    29    55.1    48.0M        -        -
/test_group_1                   4    47.3        -        -        -
/system.slice                   -     2.2     5.7G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659   394.8     5.5G        -        -
/test_group_1                   4    62.9        -        -        -
/user.slice                    28    44.9    54.2M        -        -
/test_group_2                   3    44.7        -        -        -
/system.slice                   -     0.9     5.7G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659   394.4     5.5G        -        -
/test_group_2                   3    58.8        -        -        -
/test_group_1                   4    51.9        -        -        -
/user.slice                    30    39.3    59.6M        -        -
/system.slice                   -     1.9     5.7G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659   394.7     5.5G        -        -
/test_group_1                   4    60.9        -        -        -
/test_group_2                   3    57.9        -        -        -
/user.slice                    28    43.5    36.9M        -        -
/system.slice                   -     3.0     5.7G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                             659   395.0     5.5G        -        -
/test_group_1                   4    66.8        -        -        -
/test_group_2                   3    56.3        -        -        -
/user.slice                    29    43.1    51.8M        -        -
/system.slice                   -     0.7     5.7G        -        -
4. Now move systemd-udevd to one of these test groups, say test_group_1, and perform scale up to 124 CPUs followed by scale down back to 4 CPUs from the host side.
5. Run the same workload, i.e. 4 instances of the CPU hogger under /sys/fs/cgroup/cpu and one instance of the CPU hogger each in /sys/fs/cgroup/cpu/test_group_1 and /sys/fs/cgroup/cpu/test_group_2.

It can be seen that test_group_1 (the one where systemd-udevd was moved) is getting much less CPU time than test_group_2, even though at this point in time both of these groups have only the CPU hogger running:
# systemd-cgtop --depth 1
Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1219       -     5.4G        -        -
/system.slice                   -       -     5.6G        -        -
/test_group_1                   4       -        -        -        -
/test_group_2                   3       -        -        -        -
/user.slice                    26       -    91.3M        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   394.3     5.4G        -        -
/test_group_2                   3    82.7        -        -        -
/test_group_1                   4    14.3        -        -        -
/system.slice                   -     0.8     5.6G        -        -
/user.slice                    26     0.4    91.2M        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   394.6     5.4G        -        -
/test_group_2                   3    67.4        -        -        -
/system.slice                   -    24.6     5.6G        -        -
/test_group_1                   4    12.5        -        -        -
/user.slice                    26     0.4    91.2M        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   395.2     5.4G        -        -
/test_group_2                   3    60.9        -        -        -
/system.slice                   -    27.9     5.6G        -        -
/test_group_1                   4    12.2        -        -        -
/user.slice                    26     0.4    91.2M        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   395.2     5.4G        -        -
/test_group_2                   3    69.4        -        -        -
/test_group_1                   4    13.9        -        -        -
/user.slice                    28     1.6    92.0M        -        -
/system.slice                   -     1.0     5.6G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   395.6     5.4G        -        -
/test_group_2                   3    59.3        -        -        -
/test_group_1                   4    14.1        -        -        -
/user.slice                    28     1.3    92.2M        -        -
/system.slice                   -     0.7     5.6G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   395.5     5.4G        -        -
/test_group_2                   3    67.2        -        -        -
/test_group_1                   4    11.5        -        -        -
/user.slice                    28     1.3    92.5M        -        -
/system.slice                   -     0.6     5.6G        -        -

Path                        Tasks    %CPU   Memory  Input/s Output/s
/                            1221   395.1     5.4G        -        -
/test_group_2                   3    76.8        -        -        -
/test_group_1                   4    12.9        -        -        -
/user.slice                    28     1.3    92.8M        -        -
/system.slice                   -     1.2     5.6G        -        -
From the sched_debug data it can be seen that in the bad case the load.weight of the per-CPU sched entities corresponding to test_group_1 has reduced significantly, and also that the load_avg of test_group_1 remains much higher than that of test_group_2, even though systemd-udevd stopped running a long time back and at this point both cgroups just have the CPU hogger app as the running entity."
[ mingo: Added details from the original discussion, plus minor edits to the patch. ]
Reported-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Imran Khan <imran.f.khan@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lore.kernel.org/r/20231223111545.62135-1-vincent.guittot@linaro.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44b5262b6657..a07ef7284528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4098,6 +4098,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/* rq has been offline and doesn't contribute to the share anymore: */
+	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+		return;
+
 	/*
 	 * For migration heavy workloads, access to tg->load_avg can be
 	 * unbound. Limit the update rate to at most once per ms.
@@ -4114,6 +4118,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+	long delta;
+	u64 now;
+
+	/*
+	 * No need to update load_avg for root_task_group, as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+	struct task_group *tg;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The rq clock has already been updated in
+	 * set_rq_offline(), so we should skip updating
+	 * the rq clock again in unthrottle_cfs_rq().
+	 */
+	rq_clock_start_loop_update(rq);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		clear_tg_load_avg(cfs_rq);
+	}
+	rcu_read_unlock();
+
+	rq_clock_stop_loop_update(rq);
+}
+
 /*
  * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4410,6 +4457,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
@@ -12415,6 +12464,9 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
+
+	/* Ensure that we remove rq contribution to group share: */
+	clear_tg_offline_cfs_rqs(rq);
 }
#endif /* CONFIG_SMP */
linux-stable-mirror@lists.linaro.org