This patch adds PM notifiers for handling suspend/resume of cpufreq governors. This is required for early suspend and late resume of governors.
There are multiple reasons that support this patch:

- Firstly, it seems logical to stop the governors when we know we are going into suspend. But the question is when, and whether PM notifiers are the right place. The reasons below support that decision.

- Nishanth Menon (TI) found an interesting problem on his platform, OMAP. His board wasn't working well with suspend/resume, as removing the non-boot CPUs ended up calling the driver's ->target(), which in turn tries to play with regulators. But the regulators and their I2C bus were already suspended, and this resulted in a failure. This is why we need a PM notifier here.

- Lan Tianyu (Intel) & Jinhyuk Choi (Broadcom) found another issue, where the tunables configuration for clusters/sockets with non-boot CPUs was getting lost after suspend/resume, as we were notifying governors with CPUFREQ_GOV_POLICY_EXIT on removal of the last CPU for a policy and so deallocating the memory for the tunables.

All of the above problems are fixed by having a PM notifier in place that stops any operation on the governors, so there is no need for special handling of variables like 'frozen' in the suspend/resume paths.
Reported-by: Lan Tianyu tianyu.lan@intel.com Reported-by: Nishanth Menon nm@ti.com Reported-by: Jinhyuk Choi jinchoi@broadcom.com Signed-off-by: Viresh Kumar viresh.kumar@linaro.org ---
Hi Guys,
Can you please verify if this fixes issues reported by you? I have tested this for multiple suspend-resumes on my thinkpad. It doesn't crash :)
 drivers/cpufreq/cpufreq.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 02d534d..c87ced9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
 #include <linux/syscore_ops.h>
 #include <linux/tick.h>
 #include <trace/events/power.h>
@@ -47,6 +48,9 @@ static LIST_HEAD(cpufreq_policy_list);
 static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
 #endif
 
+/* Flag to suspend/resume CPUFreq governors */
+static bool cpufreq_suspended;
+
 static inline bool has_target(void)
 {
 	return cpufreq_driver->target_index || cpufreq_driver->target;
@@ -1462,6 +1466,54 @@ static struct subsys_interface cpufreq_interface = {
 	.remove_dev	= cpufreq_remove_dev,
 };
 
+/*
+ * PM Notifier for suspending governors as some platforms can't change frequency
+ * after this point in suspend cycle. Because some of the devices (like: i2c,
+ * regulators, etc) they use for changing frequency are suspended quickly after
+ * this point.
+ */
+static int cpufreq_pm_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
+{
+	struct cpufreq_policy *policy;
+	unsigned long flags;
+
+	if (!has_target())
+		return NOTIFY_OK;
+
+	if (action == PM_SUSPEND_PREPARE) {
+		pr_debug("%s: Suspending Governors\n", __func__);
+
+		list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
+			if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP))
+				pr_err("%s: Failed to stop governor for policy: %p\n",
+				       __func__, policy);
+
+		write_lock_irqsave(&cpufreq_driver_lock, flags);
+		cpufreq_suspended = true;
+		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+	} else if (action == PM_POST_SUSPEND) {
+		pr_debug("%s: Resuming Governors\n", __func__);
+
+		write_lock_irqsave(&cpufreq_driver_lock, flags);
+		cpufreq_suspended = false;
+		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+
+		list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
+			if (__cpufreq_governor(policy, CPUFREQ_GOV_START) ||
+			    __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS))
+				pr_err("%s: Failed to start governor for policy: %p\n",
+				       __func__, policy);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pm_notifier = {
+	.notifier_call = cpufreq_pm_notify,
+};
+
 /**
  * cpufreq_bp_suspend - Prepare the boot CPU for system suspend.
  *
@@ -1752,6 +1804,8 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_target);
 static int __cpufreq_governor(struct cpufreq_policy *policy,
 					unsigned int event)
 {
+	unsigned long flags;
+	bool is_suspended;
 	int ret;
 
 	/* Only must be defined when default governor is known to have latency
@@ -1764,6 +1818,14 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
 	struct cpufreq_governor *gov = NULL;
 #endif
 
+	/* Don't start any governor operations if we are entering suspend */
+	read_lock_irqsave(&cpufreq_driver_lock, flags);
+	is_suspended = cpufreq_suspended;
+	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
+
+	if (is_suspended)
+		return 0;
+
 	if (policy->governor->max_transition_latency &&
 	    policy->cpuinfo.transition_latency >
 	    policy->governor->max_transition_latency) {
@@ -2222,6 +2284,7 @@ static int __init cpufreq_core_init(void)
 	cpufreq_global_kobject = kobject_create();
 	BUG_ON(!cpufreq_global_kobject);
 	register_syscore_ops(&cpufreq_syscore_ops);
+	register_pm_notifier(&cpufreq_pm_notifier);
 
 	return 0;
 }
On 11/15/2013 04:12 AM, Viresh Kumar wrote:
This patch adds PM notifiers for handling suspend/resume of cpufreq governors. This is required for early suspend and late resume of governors.
There are multiple reasons that support this patch:
- Firstly it looks very much logical to stop governors when we know we are going into suspend. But the question is when? Is PM notifiers the right place? Following reasons are the supporting hands for this decision.
- Nishanth Menon (TI) found an interesting problem on his platform, OMAP. His board wasn't working well with suspend/resume as calls for removing non-boot CPUs was turning out into a call to drivers ->target() which then tries to play with regulators. But regulators and their I2C bus were already suspended and this resulted in a failure. This is why we need a PM notifier here.
- Lan Tianyu (Intel) & Jinhyuk Choi (Broadcom) found another issue where tunables configuration for clusters/sockets with non-boot CPUs was getting lost after suspend/resume, as we were notifying governors with CPUFREQ_GOV_POLICY_EXIT on removal of the last cpu for that policy and so deallocating memory for tunables.
All above problems get fixed with having a PM notifier in place which will stop any operation on governor. Hence no need to do any special handling of variables like (frozen) in suspend/resume paths.
Reported-by: Lan Tianyu tianyu.lan@intel.com Reported-by: Nishanth Menon nm@ti.com Reported-by: Jinhyuk Choi jinchoi@broadcom.com Signed-off-by: Viresh Kumar viresh.kumar@linaro.org
Hi Guys,
Can you please verify if this fixes issues reported by you? I have tested this for multiple suspend-resumes on my thinkpad. It doesn't crash :)
Yes, this does fix the issue for me.
Tested-by: Nishanth Menon nm@ti.com based on vendor kernel on top of v3.12 tag (equivalent diff: http://pastebin.mozilla.org/3609407)
OMAP5-uEVM(OMAP5432): http://pastebin.mozilla.org/3609396
On 15 November 2013 19:18, Nishanth Menon nm@ti.com wrote:
Yes, this does fix the issue for me.
Tested-by: Nishanth Menon nm@ti.com based on vendor kernel on top of v3.12 tag (equivalent diff: http://pastebin.mozilla.org/3609407)
OMAP5-uEVM(OMAP5432): http://pastebin.mozilla.org/3609396
Thanks a lot for verifying buddy!! Glad that your issue is fixed..
On Friday, November 15, 2013 03:42:29 PM Viresh Kumar wrote:
This patch adds PM notifiers for handling suspend/resume of cpufreq governors. This is required for early suspend and late resume of governors.
There are multiple reasons that support this patch:
- Firstly it looks very much logical to stop governors when we know we are going into suspend. But the question is when? Is PM notifiers the right place? Following reasons are the supporting hands for this decision.
- Nishanth Menon (TI) found an interesting problem on his platform, OMAP. His board wasn't working well with suspend/resume as calls for removing non-boot CPUs was turning out into a call to drivers ->target() which then tries to play with regulators. But regulators and their I2C bus were already suspended and this resulted in a failure. This is why we need a PM notifier here.
- Lan Tianyu (Intel) & Jinhyuk Choi (Broadcom) found another issue where tunables configuration for clusters/sockets with non-boot CPUs was getting lost after suspend/resume, as we were notifying governors with CPUFREQ_GOV_POLICY_EXIT on removal of the last cpu for that policy and so deallocating memory for tunables.
All above problems get fixed with having a PM notifier in place which will stop any operation on governor. Hence no need to do any special handling of variables like (frozen) in suspend/resume paths.
Will cpufreq work during system-wide power transitions (suspend/resume etc.) after this? In particular, what about hibernation?
On Saturday 16 November 2013 05:54 AM, Rafael J. Wysocki wrote:
Will cpufreq work during system-wide power transitions (suspend/resume etc.) after this? In particular, what about hibernation?
I am disabling governors as soon as we start suspend. So no, cpufreq wouldn't work during suspend/resume. But once the system is resumed we start them back again.

Hibernation is probably not supported by my code yet.. I just went through the hibernation code quickly and it looks like we don't send PM_SUSPEND_PREPARE or PM_POST_SUSPEND notifications in case of hibernation. Correct?

And these are the notifications that we send:
- PM_HIBERNATION_PREPARE
- PM_POST_HIBERNATION
- PM_RESTORE_PREPARE
- PM_POST_RESTORE

If I am not wrong I need to stop governors on PM_HIBERNATION_PREPARE and start them back on PM_POST_HIBERNATION (I am a bit confused with this one. Does this POST_HIBERNATION one happen at the end of going into hibernation? Or after booting back? I need a notifier at the end of restore)..
-- viresh
On Saturday, November 16, 2013 10:01:50 AM viresh kumar wrote:
On Saturday 16 November 2013 05:54 AM, Rafael J. Wysocki wrote:
Will cpufreq work during system-wide power transitions (suspend/resume etc.) after this? In particular, what about hibernation?
I am disabling governors as soon as we start suspend. So No, cpufreq wouldn't work during suspend/resume. But once system is resumed we are starting it back again.
Well, disabling it for the whole duration of suspend/resume and/or hibernation may not be the right approach entirely, unless we force the pax perf of the boot CPU at least in addition to that. Otherwise the latency of suspend and the subsequent resume will depend on what perf level the CPUs were at before disabling the governors, which is not desirable at all.
Hibernation is probably not supported by my code yet.. I just went through hibernation code quickly and it looks we don't send PM_SUSPEND_PREPARE or PM_POST_SUSPEND notifications in case of hibernation. Correct?
Yes, that's correct.
And these are the notifications that we send:
- PM_HIBERNATION_PREPARE
- PM_POST_HIBERNATION
- PM_RESTORE_PREPARE
- PM_POST_RESTORE
If I am not wrong I need to stop governors on PM_HIBERNATION_PREPARE and need to start them back on: PM_POST_HIBERNATION (I am a bit confused with this one. Does this POST_HIBERNATION one happens at the end of going into hibernation? or after booting back? I need a notifier at the end of restore)..
You'd need both PM_POST_HIBERNATION and PM_POST_RESTORE, but I wouldn't really like cpufreq governors to be disabled throughout the whole hibernation.
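For reference, a minimal sketch (not part of the posted patch; the handler shape here is an assumption) of how cpufreq_pm_notify() could also cover the hibernation events named above:

static int cpufreq_pm_notify(struct notifier_block *nb, unsigned long action,
			     void *data)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
	case PM_RESTORE_PREPARE:
		/* as in the posted patch: stop governors, set cpufreq_suspended */
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:	/* hibernation aborted or failed */
	case PM_POST_RESTORE:		/* resumed from the saved image */
		/* clear cpufreq_suspended, restart governors */
		break;
	}

	return NOTIFY_OK;
}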
Actually, we use CPU offline/online during system suspend/resume to avoid having to do stuff like this from PM notifiers. So I'd like the original problem to be addressed in a different way.
Thanks!
PS The sisk.pl address will start bouncing shortly, so please replace it with rjw@rjwysocki.net in your address book.
On 16 November 2013 19:59, Rafael J. Wysocki rjw@rjwysocki.net wrote:
Well, disabling it for the whole duration of suspend/resume and/or hibernation may not be the right approach entirely, unless we force the pax perf of the
s/pax/max ?
boot CPU at least in addition to that. Otherwise the latency of suspend and the subsequent resume will depend on what perf level the CPUs where before disabling the governors, which is not desirable at al.
Well that is pretty much doable.
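Something along these lines, as a rough sketch only (the ordering and the use of cpufreq_driver_target() here are assumptions, not part of the posted patch), in the PM_SUSPEND_PREPARE branch:

	list_for_each_entry(policy, &cpufreq_policy_list, policy_list) {
		if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP))
			pr_err("%s: Failed to stop governor for policy: %p\n",
			       __func__, policy);

		/* assumed addition: pin the policy to its max frequency */
		if (cpufreq_driver_target(policy, policy->max,
					  CPUFREQ_RELATION_H))
			pr_err("%s: Failed to set policy %p to max\n",
			       __func__, policy);
	}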
And these are the notifications that we send:
- PM_HIBERNATION_PREPARE
- PM_POST_HIBERNATION
- PM_RESTORE_PREPARE
- PM_POST_RESTORE
If I am not wrong I need to stop governors on PM_HIBERNATION_PREPARE and need to start them back on: PM_POST_HIBERNATION (I am a bit confused with this one. Does this POST_HIBERNATION one happens at the end of going into hibernation? or after booting back? I need a notifier at the end of restore)..
You'd need both PM_POST_HIBERNATION and PM_POST_RESTORE, but I wouldn't really like cpufreq governors to be disabled throughout the whole hibernation.
So PM_POST_HIBERNATION is called just before shutting off the system? And PM_POST_RESTORE is called after system is resumed from saved image?
Actually, we use CPU offline/online during system suspend/resume to avoid having to do stuff like this from PM notifiers.
I didn't get the logic behind this one..
So I'd like the original problem to be addressed in a different way.
Hmm..
PS The sisk.pl address will start bouncing shortly, so please replace it with rjw@rjwysocki.net in your address book.
Okay...
On Saturday, November 16, 2013 08:47:24 PM Viresh Kumar wrote:
On 16 November 2013 19:59, Rafael J. Wysocki rjw@rjwysocki.net wrote:
Well, disabling it for the whole duration of suspend/resume and/or hibernation may not be the right approach entirely, unless we force the pax perf of the
s/pax/max ?
Yes.
boot CPU at least in addition to that. Otherwise the latency of suspend and the subsequent resume will depend on what perf level the CPUs where before disabling the governors, which is not desirable at al.
Well that is pretty much doable.
Not necessarily on all CPU models.
And these are the notifications that we send:
- PM_HIBERNATION_PREPARE
- PM_POST_HIBERNATION
- PM_RESTORE_PREPARE
- PM_POST_RESTORE
If I am not wrong I need to stop governors on PM_HIBERNATION_PREPARE and need to start them back on: PM_POST_HIBERNATION (I am a bit confused with this one. Does this POST_HIBERNATION one happens at the end of going into hibernation? or after booting back? I need a notifier at the end of restore)..
You'd need both PM_POST_HIBERNATION and PM_POST_RESTORE, but I wouldn't really like cpufreq governors to be disabled throughout the whole hibernation.
So PM_POST_HIBERNATION is called just before shutting off the system? And PM_POST_RESTORE is called after system is resumed from saved image?
PM_POST_HIBERNATION is only called if there's an error during hibernation. PM_POST_RESTORE is called as you said.
Also you have to remember that the _PREPARE PM notifiers are called before the freezing of tasks when user space is still running, so disabling governors at that point may lead to some weird behavior.
Actually, we use CPU offline/online during system suspend/resume to avoid having to do stuff like this from PM notifiers.
I didn't get the logic behind this one..
If we have to do special stuff from PM notifiers for CPU "suspend", we will be better off by doing something entirely special instead of CPU offline down the road. Which we may end up doing given the problems with frozen/not frozen in the cpufreq core.
We may introduce suspend_noirq and resume_noirq for cpu_subsys, for example, and handle things from there. Or something similar. But slapping PM notifiers on top of the existing code just because it appears to be easy (and making that code even more overdesigned than it already is this way) doesn't seem quite right.
Now, the Tianyu's patch extends the Srivatsa's approach to governors, which actually should have been done from the outset, so it is within the scope of what we have already. It may not solve all of the problems, but it still makes some progress and has a little chance to introduce *new* problems at the same time.
Thanks!
On Sunday 17 November 2013 06:38 AM, Rafael J. Wysocki wrote:
On Saturday, November 16, 2013 08:47:24 PM Viresh Kumar wrote:
Well that is pretty much doable.
Not necessarily on all CPU models.
Okay.. Just for my understanding, why?
So PM_POST_HIBERNATION is called just before shutting off the system? And PM_POST_RESTORE is called after system is resumed from saved image?
PM_POST_HIBERNATION is only called if there's an error during hibernation. PM_POST_RESTORE is called as you said.
Ahh I see. Thanks.
Also you have to remember that the _PREPARE PM notifiers are called before the freezing of tasks when user space is still running, so disabling governors at that point may lead to some weird behavior.
Actually good point. I haven't thought about it earlier.
And when I look at what bad can happen, I couldn't find much. The worst is that we wouldn't go to a frequency requested by a userspace daemon, and we wouldn't return an error then. But I feel we can let that happen. Not servicing a request after we have started system suspend doesn't look that odd..
Sysfs infrastructure is still preserved and so all that information would still be available.
Do you see anything extra that might stop working?
Actually, we use CPU offline/online during system suspend/resume to avoid having to do stuff like this from PM notifiers.
I didn't get the logic behind this one..
If we have to do special stuff from PM notifiers for CPU "suspend", we will be better off by doing something entirely special instead of CPU offline down the road. Which we may end up doing given the problems with frozen/not frozen in the cpufreq core.
An unrelated question here. Why are we offlining CPUs after suspending all the devices? The problem Nishanth mentioned was that he required a few devices, like i2c, to be available when the CPUs are going down. And there might be similar requirements at other places too. Was there any specific bottleneck due to which it is implemented this way?
We may introduce suspend_noirq and resume_noirq for cpu_subsys, for example, and handle things from there. Or something similar. But slapping PM notifiers on top of the existing code just because it appears to be easy (and making that code even more overdesigned than it already is this way) doesn't seem quite right.
Now, the Tianyu's patch extends the Srivatsa's approach to governors, which actually should have been done from the outset, so it is within the scope of what we have already. It may not solve all of the problems, but it still makes some progress and has a little chance to introduce *new* problems at the same time.
I understand your point here. But this is what I feel:
- I don't have any special affection for using PM notifiers :) .. It's just that I need some way for the cpufreq core to know that suspend has started. Maybe after freezing of tasks and before removal of devices.
- I thought of adding something like a suspend-prepare for syscore_ops (You are the owner of all these frameworks and so our life is easy as we can discuss stuff with you directly :)).. But then I thought maybe we can use PM notifiers.. But it looks like we had better do that now?
- I have concerns with Tianyu's patch, as policies are better taken care of in the cpufreq core instead of being passed over to governors. And with the alternative solution I had, the code was getting more and more dirty. And so I thought of doing something else.
- Not all platforms have a problem with changing frequency during suspend/resume, and so we may not require disabling of governors for all of them. We could probably add another field based on which we may or may not disable governors from PM or syscore notifiers.
-- viresh
On Sunday, November 17, 2013 01:52:15 PM viresh kumar wrote:
On Sunday 17 November 2013 06:38 AM, Rafael J. Wysocki wrote:
On Saturday, November 16, 2013 08:47:24 PM Viresh Kumar wrote:
Well that is pretty much doable.
Not necessarily on all CPU models.
Okay.. Just for my understanding, why?
If the graphics and processor are in one chip, the CPU may ignore your perf bump up request for power balancing reasons.
So PM_POST_HIBERNATION is called just before shutting off the system? And PM_POST_RESTORE is called after system is resumed from saved image?
PM_POST_HIBERNATION is only called if there's an error during hibernation. PM_POST_RESTORE is called as you said.
Ahh I see. Thanks.
Also you have to remember that the _PREPARE PM notifiers are called before the freezing of tasks when user space is still running, so disabling governors at that point may lead to some weird behavior.
Actually good point. I haven't thought about it earlier.
And when I see what bad can happen, I couldn't find much. The worst is that we wouldn't go to a frequency requested by userspace daemon. But we wouldn't send an error then. But I feel we can let that happen. Not servicing a request after we have started system suspend doesn't look that odd..
Sysfs infrastructure is still preserved and so all that information would still be available.
Do you see anything extra that might stop working?
Well, the code would be racy with the patch as is. User space might manipulate the sysfs knobs in parallel with your PM notifiers, for example, and I'm not entirely sure what can happen then. And the lock in there is pointless, because it doesn't prevent any races from happening.
Actually, we use CPU offline/online during system suspend/resume to avoid having to do stuff like this from PM notifiers.
I didn't get the logic behind this one..
If we have to do special stuff from PM notifiers for CPU "suspend", we will be better off by doing something entirely special instead of CPU offline down the road. Which we may end up doing given the problems with frozen/not frozen in the cpufreq core.
A unrelated question here. Why are we offlining CPUs after suspending all the devices? Because the problem Nishanth mentioned was that he required few devices, i2c, to be available when CPUs are getting down. And there might be similar requirements at other places too. Was there any specific bottleneck due to which it is implemented this way?
No, this is because the ACPI spec mandates powering down devices before CPUs during system suspend. The way it is done today, however, I think we don't need to keep that ordering so strictly any more. We definitely don't need to do that on non-ACPI systems.
So while I hate the PM notifiers idea (sorry, but that's how it goes), I think it would be OK to suspend *some* devices after disabling CPUs (not all of them, of course).
And as I said, I think it would be OK to introduce suspend/resume callbacks for CPU devices and use those callbacks to work around the ordering issues, when necessary. The main point is that the changes made for this purpose should only affect systems where they are necessary and not everyone. I don't want to change the way things work today in general in cpufreq too much unless they are plain bugs that affect everyone.
We may introduce suspend_noirq and resume_noirq for cpu_subsys, for example, and handle things from there. Or something similar. But slapping PM notifiers on top of the existing code just because it appears to be easy (and making that code even more overdesigned than it already is this way) doesn't seem quite right.
Now, the Tianyu's patch extends the Srivatsa's approach to governors, which actually should have been done from the outset, so it is within the scope of what we have already. It may not solve all of the problems, but it still makes some progress and has a little chance to introduce *new* problems at the same time.
I understand your point here. But this is what I feel:
- I don't have any special affection for using PM notifiers :) .. Its just that
I need some way for cpufreq core to know that Suspend has started. Maybe after freezing of tasks and before removal of devices.
- I thought of adding something like a suspend-prepare for syscore_ops (You are
owner of all these frameworks and so our life is easy as we can discuss stuff with you directly :)).. But then thought maybe we can use PM notifiers.. But it looks that we better do that now ?
- I have concerns with Tianyu's patch as policies should be better taken care of
in cpufreq core instead of passing them over to governors.
Well, this is all too tangled anyway, but quite frankly I'm not sure if it is worth untangling at this point. We're deprecating cpufreq anyway.
And with the alternative solution I had, code is getting more and more dirty. And so I thought of doing something else.
- Not all platforms have problem with changing frequency during suspend/resume
and so we may not require disabling of governors for all of them. Probably can add another field based on which we may/may-not disable governors from PM or syscore notifiers.
What exactly is wrong with adding suspend/resume callbacks to cpu_subsys?
On 17 November 2013 20:39, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 01:52:15 PM viresh kumar wrote:
Do you see anything extra that might stop working?
Well, the code would be racy with the patch as is. User space might manipulate the sysfs knobs in parallel with your PM notifiers, for example, and I'm not entirely sure what can happen then. And the lock in there is pointless, because it doesn't prevent any races from happening.
Hmm..
A unrelated question here. Why are we offlining CPUs after suspending all the devices? Because the problem Nishanth mentioned was that he required few devices, i2c, to be available when CPUs are getting down. And there might be similar requirements at other places too. Was there any specific bottleneck due to which it is implemented this way?
No, this is because the ACPI spec mandates powering down devices before CPUs during system suspend. The way it is done today, however, I think we don't need to keep that ordering so strictly any more. We definitely don't need to do that on non-ACPI systems.
Okay.. But different behavior for ACPI and non-ACPI in such core code doesn't look great to me.. Let's see if we can work with the existing code.
So while I hate the PM notifiers idea (sorry, but that's how it goes), I think it would be OK to suspend *some* devices after disabling CPUs (not all of them, of course).
Hmm...
And as I said, I think it would be OK to introduce suspend/resume callbacks for CPU devices and use those callbacks to work around the ordering issues, when necessary. The main point is that the changes made for this purpose should only affect systems where they are necessary and not everyone. I don't want to change the way things work today in general in cpufreq too much unless they are plain bugs that affect everyone.
We may introduce suspend_noirq and resume_noirq for cpu_subsys, for example, and handle things from there. Or something similar. But slapping PM notifiers on top of the existing code just because it appears to be easy (and making that code even more overdesigned than it already is this way) doesn't seem quite right.
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
- Or do you want to extend only the CPU subsystem's notifiers? Which notifiers exactly? And from which places do we want to issue them?
- I have concerns with Tianyu's patch as policies should be better taken care of
in cpufreq core instead of passing them over to governors.
Well, this is all too tangled anyway, but quite frankly I'm not sure if it is worth untangling at this point. We're deprecating cpufreq anyway.
I don't really think cpufreq is going anywhere even after we move bits into scheduler. All the backend CPUFreq stuff will probably stay as is, as they can't go anywhere.
One thing that will surely go away is the governor's layer, which would be directly embedded into scheduler.
Lets see how that goes..
- Not all platforms have problem with changing frequency during suspend/resume
and so we may not require disabling of governors for all of them. Probably can add another field based on which we may/may-not disable governors from PM or syscore notifiers.
What exactly is wrong with adding suspend/resume callbacks to cpu_subsys?
Okay, so you were asking about extending the following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc., to include suspend/resume ones as well?
Logically speaking, all the existing ones do look correct as they are more or less CPU related. But suspend/resume doesn't look similar, at least to me.

Suspend/resume is the system's state rather than the CPU's.. We aren't suspending or resuming CPUs, we are shutting them off.. So I thought maybe syscore ops is a better place (which is already used by cpufreq)..
On Sunday, November 17, 2013 10:27:43 PM Viresh Kumar wrote:
On 17 November 2013 20:39, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 01:52:15 PM viresh kumar wrote:
Do you see anything extra that might stop working?
Well, the code would be racy with the patch as is. User space might manipulate the sysfs knobs in parallel with your PM notifiers, for example, and I'm not entirely sure what can happen then. And the lock in there is pointless, because it doesn't prevent any races from happening.
Hmm..
A unrelated question here. Why are we offlining CPUs after suspending all the devices? Because the problem Nishanth mentioned was that he required few devices, i2c, to be available when CPUs are getting down. And there might be similar requirements at other places too. Was there any specific bottleneck due to which it is implemented this way?
No, this is because the ACPI spec mandates powering down devices before CPUs during system suspend. The way it is done today, however, I think we don't need to keep that ordering so strictly any more. We definitely don't need to do that on non-ACPI systems.
Okay.. But different behavior for acpi and non-acpi at such core code doesn't look great to me.. Lets see if we can work with existing code
So while I hate the PM notifiers idea (sorry, but that's how it goes), I think it would be OK to suspend *some* devices after disabling CPUs (not all of them, of course).
Hmm...
And as I said, I think it would be OK to introduce suspend/resume callbacks for CPU devices and use those callbacks to work around the ordering issues, when necessary. The main point is that the changes made for this purpose should only affect systems where they are necessary and not everyone. I don't want to change the way things work today in general in cpufreq too much unless they are plain bugs that affect everyone.
We may introduce suspend_noirq and resume_noirq for cpu_subsys, for example, and handle things from there. Or something similar. But slapping PM notifiers on top of the existing code just because it appears to be easy (and making that code even more overdesigned than it already is this way) doesn't seem quite right.
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
I'm not sure how useful that would be. When would you like to execute those things?
- Or you want to extend only CPU subsystems notifiers? What notifiers exactly?
And at which places we want to issue them from?
Why do we need to use notifiers? What about PM callbacks?
- I have concerns with Tianyu's patch as policies should be better taken care of
in cpufreq core instead of passing them over to governors.
Well, this is all too tangled anyway, but quite frankly I'm not sure if it is worth untangling at this point. We're deprecating cpufreq anyway.
I don't really think cpufreq is going anywhere even after we move bits into scheduler. All the backend CPUFreq stuff will probably stay as is, as they can't go anywhere.
One thing that will surely go away is the governor's layer, which would be directly embedded into scheduler.
Lets see how that goes..
- Not all platforms have problem with changing frequency during suspend/resume
and so we may not require disabling of governors for all of them. Probably can add another field based on which we may/may-not disable governors from PM or syscore notifiers.
What exactly is wrong with adding suspend/resume callbacks to cpu_subsys?
Okay, so you were asking about extending following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc.. to include suspend/resume ones as well?
No. Bus types (among other things) may provide suspend/resume callbacks for handling devices. We have a bus type for CPUs, which is called cpu_subsys and currently doesn't define any PM callbacks, although it could do that in principle. Have you investigated that possibility?
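For context, cpu_subsys is the bus_type defined in drivers/base/cpu.c; a hypothetical sketch (nothing like this exists today, and the callback names are made up) of what giving it PM callbacks could look like:

static int cpu_subsys_suspend(struct device *dev)
{
	/* would run once per CPU device during system suspend */
	return 0;
}

static int cpu_subsys_resume(struct device *dev)
{
	return 0;
}

static const struct dev_pm_ops cpu_subsys_pm_ops = {
	.suspend = cpu_subsys_suspend,
	.resume  = cpu_subsys_resume,
};

struct bus_type cpu_subsys = {
	.name	  = "cpu",
	.dev_name = "cpu",
	.pm	  = &cpu_subsys_pm_ops,	/* hypothetical addition */
};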
Logically speaking, all existing ones does look correct as they are more or less cpu related. But suspend/resume doesn't look any similar, Atleast to me.
Suspend/resume are system's state rather than CPU's.. We aren't suspending or resuming CPUs, we are shutting them off.. So I thought maybe syscore ops is a better place (which is already used by cpufreq)..
cpufreq uses syscore_ops for the boot CPU only and that admittedly is a hack. syscore_ops is specifically for things that have to be suspended with only one CPU online and with interrupts off. I'm not sure how that applies to cpufreq.
Thanks!
On 18 November 2013 03:07, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 10:27:43 PM Viresh Kumar wrote:
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
I'm not sure how useful that would be. When would you like to execute those things?
Maybe after freezing userspace. So I was actually looking to move the existing code I wrote in PM notifiers to those..
Because in our usecase, we just want to know when suspend has started or resume has finished. And so we really don't need a per cpu callback.
And so I felt probably it would be better to implement those instead of cpu_subsys callbacks.
- Or you want to extend only CPU subsystems notifiers? What notifiers exactly?
And at which places we want to issue them from?
Why do we need to use notifiers? What about PM callbacks?
Yeah, we don't need notifiers but callbacks.
Okay, so you were asking about extending following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc.. to include suspend/resume ones as well?
No. Bus types (among other things) may provide suspend/resume callbacks for handling devices. We have a bus type for CPUs, which is called cpu_subsys and currently doesn't define any PM callbacks, although it could do that in principle. Have you investigated that possibility?
I did it now and got really confused. :)
This is what my understanding is:
- A bus can register PM hooks, like suspend/resume/prepare, etc..
- Devices under that bus register themselves with that bus and eventually get their _driver's_ callbacks called via the bus hooks.. For example, an I2C controller driver's callbacks will get called via the i2c core bus..
- In the case of the cpu subsystem, even if cpu_subsys adds those hooks in drivers/base/cpu.c, those hooks will get called for each cpu. CPUs don't have a driver and so the only callbacks called are the ones of cpu_subsys.
- How will we bind/use them with cpufreq?

Our sole requirement here is to notify the cpufreq core that system suspend/resume/hibernation/restore has started/finished. How will that be fulfilled with cpu_subsys callbacks?
Logically speaking, all existing ones does look correct as they are more or less cpu related. But suspend/resume doesn't look any similar, Atleast to me.
Suspend/resume are system's state rather than CPU's.. We aren't suspending or resuming CPUs, we are shutting them off.. So I thought maybe syscore ops is a better place (which is already used by cpufreq)..
cpufreq uses syscore_ops for the boot CPU only and that admittedly is a hack.
Why do you call it a hack?
syscore_ops is specifically for things that have to be suspended with only one CPU online and with interrupts off. I'm not sure how that applies to cpufreq.
Currently syscore_ops only implements suspend/resume/shutdown callbacks and those happen precisely the way you mentioned, i.e. after removing all non-boot CPUs and disabling interrupts (and before bringing back all CPUs and enabling interrupts on the resume side).. So yes, we have that limitation currently..

Honestly speaking, I have looked at syscore ops for the first time now, when we got to this problem.. I couldn't find much information about it anywhere, other than the commit that introduced it: 40dc16
And by that, this is the definition of this framework: "PM / Core: Introduce struct syscore_ops for core subsystems PM"
I can see that you mentioned the limitations like single cpu and disabled interrupts even in the log, but I think we can enhance this framework a little bit.
Also I can see that there are many users of this framework which aren't core frameworks but simply drivers. I don't think that was the intention behind this framework, but that's how others went to use it.
So, this framework exists to service core frameworks for their requirements about PM stages. Currently that is only limited to late suspend and early resume but I feel there is space for more..
For example, our current problem.. A core framework wants to prepare before suspend starts and after everything has resumed. Obviously that would violate one of the basic rules with which this was designed, but still this feature lies in scope of syscore. And so we can keep the limitations as is for suspend/resume/shutdown but not for prepare and resume_late.
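A sketch of the kind of extension being argued for here (the suspend_prepare/post_suspend names are hypothetical; today struct syscore_ops only has suspend, resume and shutdown):

struct syscore_ops {
	struct list_head node;
	int (*suspend)(void);
	void (*resume)(void);
	void (*shutdown)(void);
	/* hypothetical additions */
	int (*suspend_prepare)(void);	/* after tasks are frozen, before devices suspend */
	void (*post_suspend)(void);	/* after devices resume, before tasks are thawed */
};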
And I really feel that even if we were able to use cpu callbacks for suspend/resume, that would be a real *Hack*, because our framework doesn't want to get a callback for each of its devices (i.e. each cpu) but a single callback at certain instances.. And syscore suits this scenario very well..
-- viresh
On 11/18/2013 01:39 PM, viresh kumar wrote:
On 18 November 2013 03:07, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 10:27:43 PM Viresh Kumar wrote:
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
I'm not sure how useful that would be. When would you like to execute those things?
Maybe after freezing userspace. So I was actually looking to move the existing code I wrote in PM notifiers to those..
Because in our usecase, we just want to know when suspend has started or resume has finished. And so we really don't need a per cpu callback.
And so I felt probably it would be better to implement those instead of cpu_subsys callbacks.
- Or you want to extend only CPU subsystems notifiers? What notifiers exactly?
And at which places we want to issue them from?
Why do we need to use notifiers? What about PM callbacks?
Yeah, we don't need notifiers but callbacks.
Okay, so you were asking about extending following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc.. to include suspend/resume ones as well?
No. Bus types (among other things) may provide suspend/resume callbacks for handling devices. We have a bus type for CPUs, which is called cpu_subsys and currently doesn't define any PM callbacks, although it could do that in principle. Have you investigated that possibility?
I did it now and got really confused. :)
This is what my understanding is:
- bus can register PM hooks, like suspend/resume/prepare, etc..
- devices under that bus would register themselves to that bus and eventually
can get their _driver's_ callbacks called via bus hooks.. For example and I2C controller driver's callbacks will get called via i2c core bus..
- In case of cpu subsystem, even if cpu_subsys adds those hooks in
drivers/base/cpu.c, then those hooks will get called for each cpu. CPU's don't have a driver and so the only callbacks called are the ones of cpu_subsys.
- How will we bind/use them with cpufreq?
How about introducing a resume/suspend callback pointer or list(if there are several places that need to deal with cpu resume/suspend) in the struct cpu and populate it in the cpufreq_add_dev()?
The suspend/resume() of cpu_subsys needs to check the callback pointer and run it if available.
On 18 November 2013 16:27, Lan Tianyu tianyu.lan@intel.com wrote:
How about introducing a resume/suspend callback pointer or list(if there are several places that need to deal with cpu resume/suspend) in the struct cpu and populate it in the cpufreq_add_dev()?
The suspend/resume() of cpu_subsys needs to check the callback pointer and run it if available.
That's almost a new infrastructure then and looks more hackish :) Apart from that even cpufreq would be a bit hacky as we don't really need per-cpu callbacks..
On 11/18/2013 07:01 PM, Viresh Kumar wrote:
On 18 November 2013 16:27, Lan Tianyu tianyu.lan@intel.com wrote:
How about introducing a resume/suspend callback pointer or list(if there are several places that need to deal with cpu resume/suspend) in the struct cpu and populate it in the cpufreq_add_dev()?
The suspend/resume() of cpu_subsys needs to check the callback pointer and run it if available.
That's almost a new infrastructure then and looks more hackish :)
The resume/suspend() must be stored in the struct driver->pm? :)
Apart from that even cpufreq would be a bit hacky as we don't really need per-cpu callbacks..
This maybe depends on where we want the issue to be fixed, right? The cpufreq driver also can fix the issue if we run their cpu_driver resume/suspend callback earlier.
Another point, I just see cpuidle_resume() and cpuidle_pause() are called in the dpm_resume_noirq and dpm_suspend_noirq(). Not sure whether this can be applied to cpufreq.
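For illustration, a sketch (cpufreq_suspend()/cpufreq_resume() are assumed names, not existing functions) of cpufreq hooks that the device PM core could call directly, mirroring the cpuidle_pause()/cpuidle_resume() calls mentioned above; at which phase they would run is exactly the open question in this thread:

void cpufreq_suspend(void)
{
	struct cpufreq_policy *policy;

	if (!has_target())
		return;

	pr_debug("%s: Suspending Governors\n", __func__);

	list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
		if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP))
			pr_err("%s: Failed to stop governor for policy: %p\n",
			       __func__, policy);

	cpufreq_suspended = true;
}

void cpufreq_resume(void)
{
	struct cpufreq_policy *policy;

	if (!has_target())
		return;

	pr_debug("%s: Resuming Governors\n", __func__);

	cpufreq_suspended = false;

	list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
		if (__cpufreq_governor(policy, CPUFREQ_GOV_START) ||
		    __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS))
			pr_err("%s: Failed to start governor for policy: %p\n",
			       __func__, policy);
}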
On 18 November 2013 19:07, Lan Tianyu tianyu.lan@intel.com wrote:
The resume/suspend() must be stored in the struct driver->pm? :)
We certainly can't go back to increasing redundancy by implementing driver-specific stuff here :)
Apart from that even cpufreq would be a bit hacky as we don't really need per-cpu callbacks..
This maybe depends on where we want the issue to be fixed, right? The cpufreq driver also can fix the issue if we run their cpu_driver resume/suspend callback earlier.
same as above..
Another point, I just see cpuidle_resume() and cpuidle_pause() are called in the dpm_resume_noirq and dpm_suspend_noirq(). Not sure whether this can be applied to cpufreq.
I would still prefer syscore_ops instead of calling framework-specific routines directly from the dpm_*() routines.. I don't know why it was done this way for cpuidle..
On Monday, November 18, 2013 09:37:39 PM Lan Tianyu wrote:
On 11/18/2013 07:01 PM, Viresh Kumar wrote:
On 18 November 2013 16:27, Lan Tianyu tianyu.lan@intel.com wrote:
How about introducing a resume/suspend callback pointer or list(if there are several places that need to deal with cpu resume/suspend) in the struct cpu and populate it in the cpufreq_add_dev()?
The suspend/resume() of cpu_subsys needs to check the callback pointer and run it if available.
That's almost a new infrastructure then and looks more hackish :)
The resume/suspend() must be stored in the struct driver->pm? :)
Apart from that even cpufreq would be a bit hacky as we don't really need per-cpu callbacks..
This maybe depends on where we want the issue to be fixed, right? The cpufreq driver also can fix the issue if we run their cpu_driver resume/suspend callback earlier.
Another point, I just see cpuidle_resume() and cpuidle_pause() are called in the dpm_resume_noirq and dpm_suspend_noirq(). Not sure whether this can be applied to cpufreq.
I don't see why not.
Thanks!
On 18 November 2013 11:09, viresh kumar viresh.kumar@linaro.org wrote:
On 18 November 2013 03:07, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 10:27:43 PM Viresh Kumar wrote:
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
I'm not sure how useful that would be. When would you like to execute those things?
Maybe after freezing userspace. So I was actually looking to move the existing code I wrote in PM notifiers to those..
Because in our usecase, we just want to know when suspend has started or resume has finished. And so we really don't need a per cpu callback.
And so I felt probably it would be better to implement those instead of cpu_subsys callbacks.
- Or you want to extend only CPU subsystems notifiers? What notifiers exactly?
And at which places we want to issue them from?
Why do we need to use notifiers? What about PM callbacks?
Yeah, we don't need notifiers but callbacks.
Okay, so you were asking about extending following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc.. to include suspend/resume ones as well?
No. Bus types (among other things) may provide suspend/resume callbacks for handling devices. We have a bus type for CPUs, which is called cpu_subsys and currently doesn't define any PM callbacks, although it could do that in principle. Have you investigated that possibility?
I did it now and got really confused. :)
This is what my understanding is:
- bus can register PM hooks, like suspend/resume/prepare, etc..
- devices under that bus would register themselves to that bus and eventually
can get their _driver's_ callbacks called via bus hooks.. For example and I2C controller driver's callbacks will get called via i2c core bus..
- In case of cpu subsystem, even if cpu_subsys adds those hooks in
drivers/base/cpu.c, then those hooks will get called for each cpu. CPU's don't have a driver and so the only callbacks called are the ones of cpu_subsys.
- How will we bind/use them with cpufreq?
Our sole requirement here is to get notify cpufreq core that system suspend/resume/hibernation/restore has started/finished. How will that get fulfilled with cpu_subsys callbacks?
Logically speaking, all existing ones does look correct as they are more or less cpu related. But suspend/resume doesn't look any similar, Atleast to me.
Suspend/resume are system's state rather than CPU's.. We aren't suspending or resuming CPUs, we are shutting them off.. So I thought maybe syscore ops is a better place (which is already used by cpufreq)..
cpufreq uses syscore_ops for the boot CPU only and that admittedly is a hack.
Why do you call it a hack?
syscore_ops is specifically for things that have to be suspended with only one CPU online and with interrupts off. I'm not sure how that applies to cpufreq.
Currently syscore_ops only implements suspend/resume/shutdown callbacks and those precisely happen the way you mentioned. i.e. after removing all non-boot CPUs and disabling interrupts (And before bringing back all CPUs and enabling interrupts on resume side).. So, yes we have limitation currently..
Honestly speaking, I have looked at syscore ops for the first time now, when we got to this problem.. I couldn't find much information about it anywhere, apart from the commit itself: 40dc16
And by that, this is the definition of this framework: "PM / Core: Introduce struct syscore_ops for core subsystems PM"
I can see that you mentioned the limitations like single cpu and disabled interrupts even in the log, but I think we can enhance this framework a little bit.
Also I can see that there are many users of this framework which aren't core frameworks but simply drivers. I don't think that was the intention behind this framework, but that's how others went to use it.
So, this framework exists to service core frameworks for their requirements about PM stages. Currently that is only limited to late suspend and early resume but I feel there is space for more..
For example, our current problem.. A core framework wants to act right before suspend starts and right after everything has resumed. Obviously that would violate one of the basic rules with which this was designed, but still this feature lies within the scope of syscore. And so we can keep the limitations as they are for suspend/resume/shutdown but not for prepare and resume_late.
And I really feel that even if we were able to use CPU callbacks for suspend/resume, that would be a real *hack*, because our framework doesn't want a callback for each of its devices (i.e. CPUs) but a single callback at certain instances.. And syscore suits this scenario very well..
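For reference, the syscore interface being discussed is tiny; below is a minimal sketch of how a subsystem hooks into it, roughly mirroring what cpufreq already registers for the boot CPU. The callback bodies and the example_init() wrapper are placeholders, not the real implementations:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Runs with only the boot CPU online and interrupts disabled. */
static int cpufreq_bp_suspend(void)
{
	/* e.g. stop the governor managing the boot CPU's policy */
	return 0;
}

static void cpufreq_bp_resume(void)
{
	/* e.g. restart the governor managing the boot CPU's policy */
}

static struct syscore_ops cpufreq_syscore_ops = {
	.suspend = cpufreq_bp_suspend,
	.resume = cpufreq_bp_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&cpufreq_syscore_ops);
	return 0;
}
core_initcall(example_init);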
Hi Rafael,
I need a few more suggestions from you on this :)
On Wednesday, November 20, 2013 11:04:28 AM Viresh Kumar wrote:
[...]
Hi Rafael,
I need a few more suggestions from you on this :)
And I need some more time to think about that.
Thanks!
On Wednesday, November 20, 2013 11:04:28 AM Viresh Kumar wrote:
On 18 November 2013 11:09, viresh kumar viresh.kumar@linaro.org wrote:
On 18 November 2013 03:07, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Sunday, November 17, 2013 10:27:43 PM Viresh Kumar wrote:
Okay.. Even these notifiers would be fine for me. To make things more clear before I start implementing them:
- What about implementing syscore's suspend_prepare and post_suspend?
I'm not sure how useful that would be. When would you like to execute those things?
Maybe after freezing userspace. So I was actually looking to move the existing code I wrote in PM notifiers to those..
Because in our usecase, we just want to know when suspend has started or resume has finished. And so we really don't need a per cpu callback.
But it won't hurt I suppose?
And so I felt probably it would be better to implement those instead of cpu_subsys callbacks.
- Or you want to extend only CPU subsystems notifiers? What notifiers exactly?
And at which places we want to issue them from?
Why do we need to use notifiers? What about PM callbacks?
Yeah, we don't need notifiers but callbacks.
Okay, so you were asking about extending following list: CPU_ONLINE, CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_DOWN_PREPARE, etc.. to include suspend/resume ones as well?
No. Bus types (among other things) may provide suspend/resume callbacks for handling devices. We have a bus type for CPUs, which is called cpu_subsys and currently doesn't define any PM callbacks, although it could do that in principle. Have you investigated that possibility?
I did it now and got really confused. :)
This is what my understanding is:
- bus can register PM hooks, like suspend/resume/prepare, etc..
- devices under that bus would register themselves to that bus and eventually
can get their _driver's_ callbacks called via bus hooks. For example, an I2C controller driver's callbacks will get called via the I2C core bus.
- In case of cpu subsystem, even if cpu_subsys adds those hooks in
drivers/base/cpu.c, then those hooks will get called for each CPU. CPUs don't have a driver
That actually isn't correct. On systems with ACPI the processor driver binds to those devices. So the processor driver could use PM callbacks on those systems in principle.
and so the only callbacks called are the ones of cpu_subsys.
- How will we bind/use them with cpufreq?
Our sole requirement here is to notify the cpufreq core that system suspend/resume/hibernation/restore has started/finished. How will that be fulfilled with cpu_subsys callbacks?
Introduce proper drivers for processors? All of the cpuidle and cpufreq stuff currently works by using its own homegrown device registration methods etc, but surely that doesn't have to be the case?
Logically speaking, all the existing ones do look correct as they are more or less CPU related. But suspend/resume doesn't look similar, at least to me.
Suspend/resume are a system state rather than a CPU state.. We aren't suspending or resuming CPUs, we are shutting them off.. So I thought maybe syscore ops is a better place (which is already used by cpufreq)..
cpufreq uses syscore_ops for the boot CPU only and that admittedly is a hack.
Why do you call it a hack?
syscore_ops is specifically for things that have to be suspended with only one CPU online and with interrupts off. I'm not sure how that applies to cpufreq.
Currently syscore_ops only implements suspend/resume/shutdown callbacks and those precisely happen the way you mentioned.
Which is very much intentional.
i.e. after removing all non-boot CPUs and disabling interrupts (And before bringing back all CPUs and enabling interrupts on resume side).. So, yes we have limitation currently..
That is a by-design limitation, mind you.
Honestly speaking I have looked at syscore ops for the first time now, when we got to this problem.. I couldn't find much information about it anywhere, leaving the commit itself: 40dc16
And by that, this is the definition of this framework: "PM / Core: Introduce struct syscore_ops for core subsystems PM"
That also is intentional, because it is very special. It really only is for stuff that has to be run after putting non-boot CPUs offline.
I can see that you mentioned the limitations like single cpu and disabled interrupts even in the log, but I think we can enhance this framework a little bit.
No.
Also I can see that there are many users of this framework which aren't core frameworks but simply drivers. I don't think that was the intention behind this framework, but that's how others went to use it.
That's not my fault.
So, this framework exists to service core frameworks for their requirements about PM stages. Currently that is only limited to late suspend and early resume but I feel there is space for more..
For example, our current problem.. A core framework wants to prepare before suspend starts and after everything has resumed. Obviously that would violate one of the basic rules with which this was designed, but still this feature lies in scope of syscore. And so we can keep the limitations as is for suspend/resume/shutdown but not for prepare and resume_late.
And I really feel even if we would be able to use cpu callbacks for suspend/resume, that would be a real *Hack*, because our framework doesn't want to get a callback for each of its devices (i.e. cpu) but a single callback at certain instances..
Oh really? So CPUs are not individual devices any more or what?
Rafael
On 21 November 2013 20:08, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Wednesday, November 20, 2013 11:04:28 AM Viresh Kumar wrote:
On 18 November 2013 11:09, viresh kumar viresh.kumar@linaro.org wrote:
Because in our use case, we just want to know when suspend has started or resume has finished. And so we really don't need a per-CPU callback.
But it won't hurt I suppose?
Hmm.. getting a single call to the cpufreq core would be faster for sure. Otherwise we need to mark all the calls except the first one as no-ops..
That actually isn't correct. On systems with ACPI the processor driver binds to those devices. So the processor driver could use PM callbacks on those systems in principle.
Introduce proper drivers for processors? All of the cpuidle and cpufreq stuff currently works by using its own homegrown device registration methods etc, but surely that doesn't have to be the case?
Hmm.. So you are asking for a new CPU driver which can be used by cpufreq and cpuidle to get callbacks? If yes, where will such a driver live? And will the ACPI processor driver exist in parallel? Or something else?
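Just to make the question concrete, a driver bound to the existing cpu bus could in principle carry PM callbacks like the sketch below. Everything here is hypothetical (driver name, callback bodies, whether it could bind alongside the ACPI processor driver), which is exactly what is being asked:

#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/init.h>
#include <linux/pm.h>

/* Called once per CPU device during system suspend/resume. */
static int cpu_drv_suspend_noirq(struct device *dev)
{
	/* e.g. forward to cpufreq/cpuidle for this CPU */
	return 0;
}

static int cpu_drv_resume_noirq(struct device *dev)
{
	return 0;
}

static const struct dev_pm_ops cpu_drv_pm_ops = {
	.suspend_noirq = cpu_drv_suspend_noirq,
	.resume_noirq = cpu_drv_resume_noirq,
};

static struct device_driver cpu_drv = {
	.name = "cpu-pm-example",
	.bus = &cpu_subsys,
	.pm = &cpu_drv_pm_ops,
};

static int __init cpu_drv_init(void)
{
	return driver_register(&cpu_drv);
}
device_initcall(cpu_drv_init);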
And I really feel even if we would be able to use cpu callbacks for suspend/resume, that would be a real *Hack*, because our framework doesn't want to get a callback for each of its devices (i.e. cpu) but a single callback at certain instances..
Oh really? So CPUs are not individual devices any more or what?
I am not calling CPU callbacks a hack, but using them for cpufreq looked like one to me.
Replying here to the other mail as well:
On 21 November 2013 20:09, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Monday, November 18, 2013 09:37:39 PM Lan Tianyu wrote:
Another point, I just see cpuidle_resume() and cpuidle_pause() are called in the dpm_resume_noirq and dpm_suspend_noirq(). Not sure whether this can be applied to cpufreq.
I don't see why not.
Interesting. So you would be happy if I add such calls after freezing userspace and before restoring it back for cpufreq?
On Thursday, November 21, 2013 09:47:20 PM Viresh Kumar wrote:
On 21 November 2013 20:08, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Wednesday, November 20, 2013 11:04:28 AM Viresh Kumar wrote:
On 18 November 2013 11:09, viresh kumar viresh.kumar@linaro.org wrote:
[...]
Replying here to the other mail as well:
On 21 November 2013 20:09, Rafael J. Wysocki rjw@rjwysocki.net wrote:
On Monday, November 18, 2013 09:37:39 PM Lan Tianyu wrote:
Another point, I just see cpuidle_resume() and cpuidle_pause() are called in the dpm_resume_noirq and dpm_suspend_noirq(). Not sure whether this can be applied to cpufreq.
I don't see why not.
Interesting. So you would be happy if I add such calls after freezing userspace and before restoring it back for cpufreq?
Short-term. To be precise, governors may be stopped at the beginning of dpm_suspend_noirq() (that is, where cpuidle_pause() is called). Analogously, they may be started again in dpm_resume_noirq(), where cpuidle_resume() is called. That at least would be consistent with what cpuidle already does.
That said in my opinion the appropriate long-term approach would be to split CPU offline and online each into two parts, the "core" part and the "extras" part, such that the "core" parts would only do the offline/online of the cores themselves. The rest, such as cpufreq/cpuidle "offline/online" would be done in the "extras" part.
Then, system suspend/resume will only use the "core" parts of CPU offline/online and the handling of the things belonging to "extras" would be carried out through CPU device suspend/resume callbacks. In turn, the "runtime" CPU offline and online would carry out both the "extras" and "core" parts as it does today.
Makes sense?
Rafael
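To make the short-term suggestion above concrete, the cpufreq side would boil down to a pair of helpers roughly like the sketch below, reusing the identifiers from the $subject patch. The names cpufreq_suspend()/cpufreq_resume() and their call sites next to cpuidle_pause()/cpuidle_resume() in dpm_suspend_noirq()/dpm_resume_noirq() are assumptions here, not merged code:

/* Would be called from dpm_suspend_noirq(), next to cpuidle_pause(). */
void cpufreq_suspend(void)
{
	struct cpufreq_policy *policy;

	if (!has_target())
		return;

	pr_debug("%s: Suspending Governors\n", __func__);
	list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
		if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP))
			pr_err("%s: Failed to stop governor for policy: %p\n",
			       __func__, policy);
	cpufreq_suspended = true;
}

/* Would be called from dpm_resume_noirq(), next to cpuidle_resume(). */
void cpufreq_resume(void)
{
	struct cpufreq_policy *policy;

	if (!has_target())
		return;

	pr_debug("%s: Resuming Governors\n", __func__);
	cpufreq_suspended = false;
	list_for_each_entry(policy, &cpufreq_policy_list, policy_list)
		if (__cpufreq_governor(policy, CPUFREQ_GOV_START) ||
		    __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS))
			pr_err("%s: Failed to start governor for policy: %p\n",
			       __func__, policy);
}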
On Friday 22 November 2013 03:44 AM, Rafael J. Wysocki wrote:
Short-term. To be precise, governors may be stopped at the beginning of dpm_suspend_noirq() (that is, where cpuidle_pause() is called). Analogously, they may be started again in dpm_resume_noirq(), where cpuidle_resume() is called. That at least would be consistent with what cpuidle already does.
Ahh, I suggested that location to be after "freeze" because I thought CPUs are removed before dpm_suspend_noirq() is called. And yes, I was *wrong*..
So, dpm_suspend_noirq() and dpm_resume_noirq() look to be the right places to get that stuff in.. And that will fit cleanly into the existing code as well.. Not many changes would be required in the $subject patch..
That said in my opinion the appropriate long-term approach would be to split CPU offline and online each into two parts, the "core" part and the "extras" part, such that the "core" parts would only do the offline/online of the cores themselves. The rest, such as cpufreq/cpuidle "offline/online" would be done in the "extras" part.
Then, system suspend/resume will only use the "core" parts of CPU offline/online and the handling of the things belonging to "extras" would be carried out through CPU device suspend/resume callbacks. In turn, the "runtime" CPU offline and online would carry out both the "extras" and "core" parts as it does today.
Makes sense?
Yes it does. Very much.
So, I will probably float an initial patch with the dpm_{suspend|resume}_noirq() approach to get things fixed for now, and then will do what you suggested. And yes, logically this makes sense, a lot of sense. cpuidle/cpufreq are about managing CPUs and so we'd better have a CPU driver here to take care of the suspend/resume paths.
I have a few questions regarding the long-term solution. There can be only one driver for any device; this is how the device-driver model works. But there can be many users of a CPU driver, like ACPI (which we already have: drivers/acpi/processor_driver.c), cpufreq, cpuidle and maybe more..
To get all of these serviced together we would probably need to write another layer on top of the driver, to which they would register their callbacks.
Then I started looking into kernel code to understand different frameworks we are using and came across: subsys_interface. This is the comment over it:
 * Simple interfaces attached to a subsystem. Multiple interfaces can
 * attach to a subsystem and its devices. Unlike drivers, they do not
 * exclusively claim or control devices. Interfaces usually represent
 * a specific functionality of a subsystem/class of devices.
And it fits our purpose exactly. We don't really need a CPU driver as there are multiple frameworks that need callbacks for the same device. And probably we just need an interface which would call user-specific callbacks (the users being cpufreq, cpuidle, maybe more)..
So, what about something like this?

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index f48370d..523c0bc 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -120,6 +120,45 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 #endif /* CONFIG_HOTPLUG_CPU */
 
+int cpu_subsys_suspend_noirq(struct device *dev)
+{
+	struct bus_type *bus = dev->bus;
+	struct subsys_interface *sif;
+	int ret = 0;
+
+	list_for_each_entry(sif, &bus->p->interfaces, node) {
+		if (sif->pm && sif->pm->suspend_noirq) {
+			ret = sif->pm->suspend_noirq(dev);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+int cpu_subsys_resume_noirq(struct device *dev)
+{
+	struct bus_type *bus = dev->bus;
+	struct subsys_interface *sif;
+	int ret = 0;
+
+	list_for_each_entry(sif, &bus->p->interfaces, node) {
+		if (sif->pm && sif->pm->resume_noirq) {
+			ret = sif->pm->resume_noirq(dev);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+static const struct dev_pm_ops cpu_subsys_pm_ops = {
+	.suspend_noirq = cpu_subsys_suspend_noirq,
+	.resume_noirq = cpu_subsys_resume_noirq,
+};
+
 struct bus_type cpu_subsys = {
 	.name = "cpu",
 	.dev_name = "cpu",
@@ -128,6 +167,7 @@ struct bus_type cpu_subsys = {
 	.online = cpu_subsys_online,
 	.offline = cpu_subsys_offline,
 #endif
+	.pm = &cpu_subsys_pm_ops,
 };
 EXPORT_SYMBOL_GPL(cpu_subsys);

diff --git a/include/linux/device.h b/include/linux/device.h
index b025925..fa01273 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -298,11 +298,16 @@ struct device *driver_find_device(struct device_driver *drv,
  * @node: the list of functions registered at the subsystem
  * @add_dev: device hookup to device function handler
  * @remove_dev: device hookup to device function handler
+ * @pm: Power management operations of this interface.
  *
  * Simple interfaces attached to a subsystem. Multiple interfaces can
  * attach to a subsystem and its devices. Unlike drivers, they do not
  * exclusively claim or control devices. Interfaces usually represent
  * a specific functionality of a subsystem/class of devices.
+ *
+ * PM callbacks are called from individual subsystems instead of the PM core,
+ * and hence might not be available for all subsystems. Currently present for:
+ * cpu_subsys.
  */
 struct subsys_interface {
 	const char *name;
@@ -310,6 +315,7 @@ struct subsys_interface {
 	struct list_head node;
 	int (*add_dev)(struct device *dev, struct subsys_interface *sif);
 	int (*remove_dev)(struct device *dev, struct subsys_interface *sif);
+	const struct dev_pm_ops *pm;
 };
 
 int subsys_interface_register(struct subsys_interface *sif);
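On the consumer side, cpufreq's existing subsys_interface would then just grow a pm pointer. A rough sketch follows; the callback bodies, and whether acting once per CPU device or once per policy is enough, are open questions:

static int cpufreq_suspend_noirq(struct device *dev)
{
	/* called once per CPU device; e.g. stop the governor when the policy owner goes down */
	return 0;
}

static int cpufreq_resume_noirq(struct device *dev)
{
	return 0;
}

static const struct dev_pm_ops cpufreq_subsys_pm_ops = {
	.suspend_noirq = cpufreq_suspend_noirq,
	.resume_noirq = cpufreq_resume_noirq,
};

static struct subsys_interface cpufreq_interface = {
	.name		= "cpufreq",
	.subsys		= &cpu_subsys,
	.add_dev	= cpufreq_add_dev,
	.remove_dev	= cpufreq_remove_dev,
	.pm		= &cpufreq_subsys_pm_ops,
};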
On 22 November 2013 14:41, viresh kumar viresh.kumar@linaro.org wrote:
So, what about something like this ?
[...]
Any inputs?
On Monday, November 25, 2013 09:55:19 AM Viresh Kumar wrote:
[...]
Any inputs?
Please give me some more time, this surely is not urgent?
Hi!
Well, disabling it for the whole duration of suspend/resume and/or hibernation may not be the right approach entirely, unless we force the pax perf of the
s/pax/max ?
Yes.
I'm not sure max frequency is a good idea. In particular, early athlon64 notebooks could not run on max frequency on battery power.
IMO it would be good to keep governors running during hibernation.
Pavel
On Wednesday, December 25, 2013 11:39:07 PM Pavel Machek wrote:
Hi!
Well, disabling it for the whole duration of suspend/resume and/or hibernation may not be the right approach entirely, unless we force the pax perf of the
s/pax/max ?
Yes.
I'm not sure max frequency is a good idea. In particular, early athlon64 notebooks could not run on max frequency on battery power.
IMO it would be good to keep governors running during hibernation.
The idea of using PM notifiers for cpufreq suspend/resume has been abandoned already.
Thanks, Rafael
On 11/15/2013 06:12 PM, Viresh Kumar wrote:
This patch adds PM notifiers for handling suspend/resume of cpufreq governors. This is required for early suspend and late resume of governors.
There are multiple reasons that support this patch:
- Firstly it looks very much logical to stop governors when we know we are going into suspend. But the question is when? Is PM notifiers the right place? Following reasons are the supporting hands for this decision.
- Nishanth Menon (TI) found an interesting problem on his platform, OMAP. His board wasn't working well with suspend/resume as calls for removing non-boot CPUs was turning out into a call to drivers ->target() which then tries to play with regulators. But regulators and their I2C bus were already suspended and this resulted in a failure. This is why we need a PM notifier here.
- Lan Tianyu (Intel) & Jinhyuk Choi (Broadcom) found another issue where tunables configuration for clusters/sockets with non-boot CPUs was getting lost after suspend/resume, as we were notifying governors with CPUFREQ_GOV_POLICY_EXIT on removal of the last cpu for that policy and so deallocating memory for tunables.
All above problems get fixed with having a PM notifier in place which will stop any operation on governor. Hence no need to do any special handling of variables like (frozen) in suspend/resume paths.
Reported-by: Lan Tianyu tianyu.lan@intel.com Reported-by: Nishanth Menon nm@ti.com Reported-by: Jinhyuk Choi jinchoi@broadcom.com Signed-off-by: Viresh Kumar viresh.kumar@linaro.org
Hi Guys,
Can you please verify if this fixes issues reported by you? I have tested this for multiple suspend-resumes on my thinkpad. It doesn't crash :)
This patch fixes my issue.
Tested-by: Lan Tianyu tianyu.lan@intel.com
[...]