The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files.
Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths.
This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was:
intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem);
pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem
Reported-by: Joonas Lahtinen joonas.lahtinen@linux.intel.com Signed-off-by: Viresh Kumar viresh.kumar@linaro.org --- drivers/cpufreq/intel_pstate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f4d85c2ae7b1..2e7058a2479d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -287,7 +287,7 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); }
-static void intel_pstate_hwp_set(void) +static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; @@ -297,9 +297,7 @@ static void intel_pstate_hwp_set(void) hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min;
- get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, cpumask) { rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range; @@ -318,7 +316,12 @@ static void intel_pstate_hwp_set(void) value |= HWP_MAX_PERF(max); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } +}
+static void intel_pstate_hwp_set_online_cpus(void) +{ + get_online_cpus(); + intel_pstate_hwp_set(cpu_online_mask); put_online_cpus(); }
@@ -440,7 +443,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, limits->no_turbo = clamp_t(int, input, 0, 1);
if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus();
return count; } @@ -466,7 +469,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100));
if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus(); return count; }
@@ -491,7 +494,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100));
if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus(); return count; }
@@ -1112,7 +1115,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set(policy->cpus); return 0; }
@@ -1144,7 +1147,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) int_tofp(100));
if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set(policy->cpus);
return 0; }
Hi,
This fixes the issue for my machine, we'll try in our CI system, too.
CC'd Daniel for that. My R-b and T-b below.
On ma, 2016-02-22 at 10:27 +0530, Viresh Kumar wrote:
The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files.
Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths.
This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was:
intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem);
pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem
Reported-by: Joonas Lahtinen joonas.lahtinen@linux.intel.com
Tested-by: Joonas Lahtinen joonas.lahtinen@linux.intel.com Reviewed-by: Joonas Lahtinen joonas.lahtinen@linux.intel.com
Signed-off-by: Viresh Kumar viresh.kumar@linaro.org
drivers/cpufreq/intel_pstate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f4d85c2ae7b1..2e7058a2479d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -287,7 +287,7 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); } -static void intel_pstate_hwp_set(void) +static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; @@ -297,9 +297,7 @@ static void intel_pstate_hwp_set(void) hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min;
- get_online_cpus();
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, cpumask) {
rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range; @@ -318,7 +316,12 @@ static void intel_pstate_hwp_set(void) value |= HWP_MAX_PERF(max); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } +} +static void intel_pstate_hwp_set_online_cpus(void) +{
+ get_online_cpus();
+ intel_pstate_hwp_set(cpu_online_mask);
put_online_cpus(); } @@ -440,7 +443,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, limits->no_turbo = clamp_t(int, input, 0, 1); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; } @@ -466,7 +469,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; } @@ -491,7 +494,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; } @@ -1112,7 +1115,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set(policy->cpus);
return 0; } @@ -1144,7 +1147,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set(policy->cpus);
return 0; }
Hi Kumar,
-----Original Message----- From: linux-pm-owner@vger.kernel.org [mailto:linux-pm-owner@vger.kernel.org] On Behalf Of Viresh Kumar Sent: Monday, February 22, 2016 12:58 PM To: Rafael Wysocki; Srinivas Pandruvada; Len Brown; Viresh Kumar Cc: linaro-kernel@lists.linaro.org; linux-pm@vger.kernel.org; Joonas Lahtinen; linux-kernel@vger.kernel.org Subject: [PATCH] intel-pstate: Update frequencies of policy->cpus only from ->set_policy()
The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the -
set_policy() callback applies to a smaller group of CPUs managed by the
policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files.
IIRC: 1. HWP is implemented per-package in hardware; CPUs inside one package share one HWP. 2. Currently all the CPUs share the same HWP settings according to the intel_pstate design. 3. The policy is per-cpu in the intel_pstate driver (policy->cpus only contains one cpu).
So with this patch applied, it is likely CPUs may have different HWP settings? For example: CPU 0 belongs to package A with policy 0, and CPU 1 belongs to package B with policy 1. If you change policy 0 from powersave to performance, then only CPU 0 will update its min/max freq in HWP; however, we should also update CPU 2's min/max in the HWP settings? Please correct me if I'm wrong.
thanks, yu
Hi,
I am not really an intel-pstate driver guy, just wrote the patch based on software-review of the stuff :)
On 22-02-16, 10:17, Chen, Yu C wrote:
IIRC, 1.HWP is hardwarely per-package, CPUs inside one package have one shared HWP. 2.Currently all the CPUs share the same HWP settings according to intel_pstate design. 3. The policy is per-cpu in intel_pstate driver.(policy->cpus only contains one cpu)
So with this patch applied, it is likely CPUs may have different HWP settings?
I think the hardware should be able to cope with that, and should be selecting the frequency based on the highest frequency requested for the same package. Otherwise, why should there be an option to supply per-cpu settings ?
For example: CPU 0 belongs to package A with policy 0, and CPU 1 belongs to package B with policy 1, If you change the policy 0 from powersave to performance, then only CPU0 will update its min/max freq in HWP, however we should also update CPU 2's min/max in HWP settings? Plz correct me if I'm wrong..
I will let the official intel-pstate guys reply to that.
On Mon, Feb 22, 2016 at 12:27 PM, Viresh Kumar viresh.kumar@linaro.org wrote:
Hi,
I am not really an intel-pstate driver guy, just wrote the patch based on software-review of the stuff :)
On 22-02-16, 10:17, Chen, Yu C wrote:
IIRC, 1.HWP is hardwarely per-package, CPUs inside one package have one shared HWP. 2.Currently all the CPUs share the same HWP settings according to intel_pstate design. 3. The policy is per-cpu in intel_pstate driver.(policy->cpus only contains one cpu)
So with this patch applied, it is likely CPUs may have different HWP settings?
I think the hardware should be able to cope with that, and should be selecting the frequency based on the highest frequency requested for the same package. Otherwise, why should there be an option to supply per-cpu settings ?
Right.
I can easily imagine a use case in which someone may want to have different ranges for different CPUs.
For example: CPU 0 belongs to package A with policy 0, and CPU 1 belongs to package B with policy 1, If you change the policy 0 from powersave to performance, then only CPU0 will update its min/max freq in HWP, however we should also update CPU 2's min/max in HWP settings? Plz correct me if I'm wrong..
I will let the official intel-pstate guys reply to that.
My opinion is to do what your patch does until that proves to be a problem in practice.
Thanks, Rafael
On Mon, 2016-02-22 at 13:54 +0100, Rafael J. Wysocki wrote:
On Mon, Feb 22, 2016 at 12:27 PM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
Hi,
I am not really an intel-pstate driver guy, just wrote the patch based on software-review of the stuff :)
On 22-02-16, 10:17, Chen, Yu C wrote:
IIRC, 1.HWP is hardwarely per-package, CPUs inside one package have one shared HWP. 2.Currently all the CPUs share the same HWP settings according to intel_pstate design. 3. The policy is per-cpu in intel_pstate driver.(policy->cpus only contains one cpu)
So with this patch applied, it is likely CPUs may have different HWP settings?
I think the hardware should be able to cope with that, and should be selecting the frequency based on the highest frequency requested for the same package. Otherwise, why should there be an option to supply per-cpu settings ?
Right.
I can easily imagine a use case in which someone may want to have different ranges for different CPUs.
For example: CPU 0 belongs to package A with policy 0, and CPU 1 belongs to package B with policy 1, If you change the policy 0 from powersave to performance, then only CPU0 will update its min/max freq in HWP, however we should also update CPU 2's min/max in HWP settings? Plz correct me if I'm wrong..
I will let the official intel-pstate guys reply to that.
My opinion is to do what your patch does until that proves to be a problem in practice.
I agree. If someone just changes policy in one CPU, even with current code (before this patch) we have issue, we will change the limits in processor for all online CPUs, but cpufreq core policy will be update for current CPU only. I suggest users to use cpupower like utility if someone want to change policy, which will change for all.
Thanks, Srinivas
Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 02/21/2016 08:57 PM, Viresh Kumar wrote:
The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files.
Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths.
This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was:
intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem);
pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem
Reported-by: Joonas Lahtinen joonas.lahtinen@linux.intel.com Signed-off-by: Viresh Kumar viresh.kumar@linaro.org
Acked-by: Srinivas Pandruvada srinivas.pandruvada@linux.intel.com
drivers/cpufreq/intel_pstate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f4d85c2ae7b1..2e7058a2479d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -287,7 +287,7 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); } -static void intel_pstate_hwp_set(void) +static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; @@ -297,9 +297,7 @@ static void intel_pstate_hwp_set(void) hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min;
- get_online_cpus();
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, cpumask) { rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range;
@@ -318,7 +316,12 @@ static void intel_pstate_hwp_set(void) value |= HWP_MAX_PERF(max); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } +} +static void intel_pstate_hwp_set_online_cpus(void) +{
+ get_online_cpus();
+ intel_pstate_hwp_set(cpu_online_mask); put_online_cpus(); }
@@ -440,7 +443,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, limits->no_turbo = clamp_t(int, input, 0, 1); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; } @@ -466,7 +469,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; }
@@ -491,7 +494,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set_online_cpus();
return count; }
@@ -1112,7 +1115,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set(policy->cpus);
return 0; }
@@ -1144,7 +1147,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) int_tofp(100)); if (hwp_active)
- intel_pstate_hwp_set();
+ intel_pstate_hwp_set(policy->cpus);
return 0; }
linaro-kernel@lists.linaro.org