From: Shen Yong b00984@freescale.com
In order for PowerTOP to be able to report how well the new runtime PM is working for the various drivers, the kernel needs to export some basic statistics in sysfs.
This patch adds two sysfs files in the runtime PM domain that expose the total time a device has been active, and the time a device has been suspended.
With this PowerTOP can compute the activity percentage
Active %age = 100 * (delta active) / (delta active + delta suspended)
and present the information to the user.
I've written the PowerTOP code (slated for version 1.12) already, and the output looks like this:
Runtime Device Power Management statistics Active Device name 10.0% 06:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8101E/RTL8102E PCI Express Fast Ethernet controller
[version 2: fix stat update bugs noticed by Alan Stern] [version 3: rebase to -next and move the sysfs declaration]
Signed-off-by: Arjan van de Ven arjan@linux.intel.com Signed-off-by: Rafael J. Wysocki rjw@sisk.pl (cherry picked from commit 8d4b9d1bfef117862a2889dec4dac227068544c9)
Conflicts:
drivers/base/power/sysfs.c --- drivers/base/power/runtime.c | 54 ++++++++++++++++++++++++++++++---- drivers/base/power/sysfs.c | 64 +++++++++++++++++++++++++++++++++++++++++- include/linux/pm.h | 6 ++++ 3 files changed, 116 insertions(+), 8 deletions(-)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b0ec0e9..b78c401 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -123,6 +123,45 @@ int pm_runtime_idle(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_idle);
+ +/** + * update_pm_runtime_accounting - Update the time accounting of power states + * @dev: Device to update the accounting for + * + * In order to be able to have time accounting of the various power states + * (as used by programs such as PowerTOP to show the effectiveness of runtime + * PM), we need to track the time spent in each state. + * update_pm_runtime_accounting must be called each time before the + * runtime_status field is updated, to account the time in the old state + * correctly. + */ +void update_pm_runtime_accounting(struct device *dev) +{ + unsigned long now = jiffies; + int delta; + + delta = now - dev->power.accounting_timestamp; + + if (delta < 0) + delta = 0; + + dev->power.accounting_timestamp = now; + + if (dev->power.disable_depth > 0) + return; + + if (dev->power.runtime_status == RPM_SUSPENDED) + dev->power.suspended_jiffies += delta; + else + dev->power.active_jiffies += delta; +} + +static void __update_runtime_status(struct device *dev, enum rpm_status status) +{ + update_pm_runtime_accounting(dev); + dev->power.runtime_status = status; +} + /** * __pm_runtime_suspend - Carry out run-time suspend of given device. * @dev: Device to suspend. @@ -197,7 +236,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_SUSPENDING; + __update_runtime_status(dev, RPM_SUSPENDING); dev->power.deferred_resume = false;
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { @@ -228,7 +267,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) }
if (retval) { - dev->power.runtime_status = RPM_ACTIVE; + __update_runtime_status(dev, RPM_ACTIVE); if (retval == -EAGAIN || retval == -EBUSY) { if (dev->power.timer_expires == 0) notify = true; @@ -237,7 +276,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) pm_runtime_cancel_pending(dev); } } else { - dev->power.runtime_status = RPM_SUSPENDED; + __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev);
if (dev->parent) { @@ -381,7 +420,7 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_RESUMING; + __update_runtime_status(dev, RPM_RESUMING);
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { spin_unlock_irq(&dev->power.lock); @@ -411,10 +450,10 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) }
if (retval) { - dev->power.runtime_status = RPM_SUSPENDED; + __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); } else { - dev->power.runtime_status = RPM_ACTIVE; + __update_runtime_status(dev, RPM_ACTIVE); if (parent) atomic_inc(&parent->power.child_count); } @@ -848,7 +887,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) }
out_set: - dev->power.runtime_status = status; + __update_runtime_status(dev, status); dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); @@ -1077,6 +1116,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; + dev->power.accounting_timestamp = jiffies; INIT_WORK(&dev->power.work, pm_runtime_work);
dev->power.timer_expires = 0; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a4c33bc..c637cb4 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,6 +6,7 @@ #include <linux/string.h> #include <linux/pm_runtime.h> #include <asm/atomic.h> +#include <linux/jiffies.h> #include "power.h"
/* @@ -108,6 +109,65 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr, }
static DEVICE_ATTR(control, 0644, control_show, control_store); + +static ssize_t rtpm_active_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + spin_lock_irq(&dev->power.lock); + update_pm_runtime_accounting(dev); + ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies)); + spin_unlock_irq(&dev->power.lock); + return ret; +} + +static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL); + +static ssize_t rtpm_suspended_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + spin_lock_irq(&dev->power.lock); + update_pm_runtime_accounting(dev); + ret = sprintf(buf, "%i\n", + jiffies_to_msecs(dev->power.suspended_jiffies)); + spin_unlock_irq(&dev->power.lock); + return ret; +} + +static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL); + +static ssize_t rtpm_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const char *p; + + if (dev->power.runtime_error) { + p = "error\n"; + } else if (dev->power.disable_depth) { + p = "unsupported\n"; + } else { + switch (dev->power.runtime_status) { + case RPM_SUSPENDED: + p = "suspended\n"; + break; + case RPM_SUSPENDING: + p = "suspending\n"; + break; + case RPM_RESUMING: + p = "resuming\n"; + break; + case RPM_ACTIVE: + p = "active\n"; + break; + default: + return -EIO; + } + } + return sprintf(buf, p); +} + +static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL); #endif
static ssize_t @@ -228,6 +288,9 @@ static DEVICE_ATTR(async, 0644, async_show, async_store); static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_control.attr, + &dev_attr_runtime_status.attr, + &dev_attr_runtime_suspended_time.attr, + &dev_attr_runtime_active_time.attr, #endif &dev_attr_wakeup.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG @@ -235,7 +298,6 @@ static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr, - &dev_attr_runtime_status.attr, &dev_attr_runtime_enabled.attr, #endif #endif diff --git a/include/linux/pm.h b/include/linux/pm.h index 8e258c7..dca597f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -476,9 +476,15 @@ struct dev_pm_info { enum rpm_request request; enum rpm_status runtime_status; int runtime_error; + unsigned long active_jiffies; + unsigned long suspended_jiffies; + unsigned long accounting_timestamp; #endif };
+extern void update_pm_runtime_accounting(struct device *dev); + + /* * The PM_EVENT_ messages are also used by drivers implementing the legacy * suspend framework, based on the ->suspend() and ->resume() callbacks common
From: Shen Yong b00984@freescale.com
This patch adds state accounting functionality to the AHCI driver for PowerTOP to use. The parts of the patch are 1) the sysfs logic of exposing the stats for each state in sysfs 2) the basic accounting logic that gets update on link change interrupts (or when the user accesses the info from sysfs) 3) a "accounting enable" flag; in order to get the accounting to work, the driver needs to get phyrdy interrupts on link status changes. Normally and currently this is disabled by the driver when ALPM is on (to reduce overhead); when PowerTOP is running this will need to be on to get usable statistics... hence the sysfs tunable.
The PowerTOP output currently looks like this:
Recent SATA AHCI link activity statistics Active Partial Slumber Device name 0.5% 99.5% 0.0% host0
(work to resolve "host0" to a more human readable name is in progress)
Signed-off-by: Arjan van de Ven arjan@linux.intel.com --- drivers/ata/ahci.h | 15 ++++ drivers/ata/libahci.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 203 insertions(+), 2 deletions(-)
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 7113c57..6a3a291 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -261,6 +261,13 @@ struct ahci_em_priv { unsigned long led_state; };
+enum ahci_port_states { + AHCI_PORT_NOLINK = 0, + AHCI_PORT_ACTIVE = 1, + AHCI_PORT_PARTIAL = 2, + AHCI_PORT_SLUMBER = 3 +}; + struct ahci_port_priv { struct ata_link *active_link; struct ahci_cmd_hdr *cmd_slot; @@ -279,6 +286,14 @@ struct ahci_port_priv { int fbs_last_dev; /* save FBS.DEV of last FIS */ /* enclosure management info per PM slot */ struct ahci_em_priv em_priv[EM_MAX_SLOTS]; + + /* ALPM accounting state and stats */ + unsigned int accounting_active:1; + u64 active_jiffies; + u64 partial_jiffies; + u64 slumber_jiffies; + int previous_state; + int previous_jiffies; };
struct ahci_host_priv { diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 81e772a..c3250ee 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -59,6 +59,20 @@ MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ig static int ahci_enable_alpm(struct ata_port *ap, enum link_pm policy); static void ahci_disable_alpm(struct ata_port *ap); +static ssize_t ahci_alpm_show_active(struct device *dev, + struct device_attribute *attr, char *buf); +static ssize_t ahci_alpm_show_slumber(struct device *dev, + struct device_attribute *attr, char *buf); +static ssize_t ahci_alpm_show_partial(struct device *dev, + struct device_attribute *attr, char *buf); + +static ssize_t ahci_alpm_show_accounting(struct device *dev, + struct device_attribute *attr, char *buf); + +static ssize_t ahci_alpm_set_accounting(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); + static ssize_t ahci_led_show(struct ata_port *ap, char *buf); static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, size_t size); @@ -118,6 +132,12 @@ static DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL); static DEVICE_ATTR(ahci_host_cap2, S_IRUGO, ahci_show_host_cap2, NULL); static DEVICE_ATTR(ahci_host_version, S_IRUGO, ahci_show_host_version, NULL); static DEVICE_ATTR(ahci_port_cmd, S_IRUGO, ahci_show_port_cmd, NULL); +static DEVICE_ATTR(ahci_alpm_active, S_IRUGO, ahci_alpm_show_active, NULL); +static DEVICE_ATTR(ahci_alpm_partial, S_IRUGO, ahci_alpm_show_partial, NULL); +static DEVICE_ATTR(ahci_alpm_slumber, S_IRUGO, ahci_alpm_show_slumber, NULL); +static DEVICE_ATTR(ahci_alpm_accounting, S_IRUGO | S_IWUSR, + ahci_alpm_show_accounting, ahci_alpm_set_accounting); + static DEVICE_ATTR(em_buffer, S_IWUSR | S_IRUGO, ahci_read_em_buffer, ahci_store_em_buffer);
@@ -129,6 +149,10 @@ static struct device_attribute *ahci_shost_attrs[] = { &dev_attr_ahci_host_cap2, &dev_attr_ahci_host_version, &dev_attr_ahci_port_cmd, + &dev_attr_ahci_alpm_active, + &dev_attr_ahci_alpm_partial, + &dev_attr_ahci_alpm_slumber, + &dev_attr_ahci_alpm_accounting, &dev_attr_em_buffer, NULL }; @@ -734,9 +758,14 @@ static int ahci_enable_alpm(struct ata_port *ap, * getting woken up due to spurious phy ready interrupts * TBD - Hot plug should be done via polling now, is * that even supported? + * + * However, when accounting_active is set, we do want + * the interrupts for accounting purposes. */ - pp->intr_mask &= ~PORT_IRQ_PHYRDY; - writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + if (!pp->accounting_active) { + pp->intr_mask &= ~PORT_IRQ_PHYRDY; + writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + }
/* * Set a flag to indicate that we should ignore all PhyRdy @@ -1645,6 +1674,162 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat) ata_port_abort(ap); }
+static int get_current_alpm_state(struct ata_port *ap) +{ + u32 status = 0; + + ahci_scr_read(&ap->link, SCR_STATUS, &status); + + /* link status is in bits 11-8 */ + status = status >> 8; + status = status & 0x7; + + if (status == 6) + return AHCI_PORT_SLUMBER; + if (status == 2) + return AHCI_PORT_PARTIAL; + if (status == 1) + return AHCI_PORT_ACTIVE; + return AHCI_PORT_NOLINK; +} + +static void account_alpm_stats(struct ata_port *ap) +{ + struct ahci_port_priv *pp; + + int new_state; + u64 new_jiffies, jiffies_delta; + + if (ap == NULL) + return; + pp = ap->private_data; + + if (!pp) return; + + new_state = get_current_alpm_state(ap); + new_jiffies = jiffies; + + jiffies_delta = new_jiffies - pp->previous_jiffies; + + switch (pp->previous_state) { + case AHCI_PORT_NOLINK: + pp->active_jiffies = 0; + pp->partial_jiffies = 0; + pp->slumber_jiffies = 0; + break; + case AHCI_PORT_ACTIVE: + pp->active_jiffies += jiffies_delta; + break; + case AHCI_PORT_PARTIAL: + pp->partial_jiffies += jiffies_delta; + break; + case AHCI_PORT_SLUMBER: + pp->slumber_jiffies += jiffies_delta; + break; + default: + break; + } + pp->previous_state = new_state; + pp->previous_jiffies = new_jiffies; +} + +static ssize_t ahci_alpm_show_active(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + pp = ap->private_data; + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", jiffies_to_msecs(pp->active_jiffies)); +} + +static ssize_t ahci_alpm_show_partial(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", jiffies_to_msecs(pp->partial_jiffies)); +} + +static ssize_t ahci_alpm_show_slumber(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", jiffies_to_msecs(pp->slumber_jiffies)); +} + +static ssize_t ahci_alpm_show_accounting(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + + return sprintf(buf, "%u\n", pp->accounting_active); +} + +static ssize_t ahci_alpm_set_accounting(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long flags; + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + void __iomem *port_mmio; + + if (!ap || ata_port_is_dummy(ap)) + return 1; + + pp = ap->private_data; + port_mmio = ahci_port_base(ap); + + if (!pp) + return 1; + if (buf[0] == '0') + pp->accounting_active = 0; + if (buf[0] == '1') + pp->accounting_active = 1; + + /* we need to enable the PHYRDY interrupt when we want accounting */ + if (pp->accounting_active) { + spin_lock_irqsave(ap->lock, flags); + pp->intr_mask |= PORT_IRQ_PHYRDY; + writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + spin_unlock_irqrestore(ap->lock, flags); + } + return count; +} + + static void ahci_port_intr(struct ata_port *ap) { void __iomem *port_mmio = ahci_port_base(ap); @@ -1670,6 +1855,7 @@ static void ahci_port_intr(struct ata_port *ap) if ((hpriv->flags & AHCI_HFLAG_NO_HOTPLUG) && (status & PORT_IRQ_PHYRDY)) { status &= ~PORT_IRQ_PHYRDY; + account_alpm_stats(ap); ahci_scr_write(&ap->link, SCR_ERROR, ((1 << 16) | (1 << 18))); }
From: Shen Yong b00984@freescale.com
The trace point follows the same logic/style as the block_dump code and pretty much dumps the same data, just not to dmesg (and thus to /var/log/messages) but via the trace events streams.
Signed-of-by: Arjan van de Ven arjan@linux.intel.com --- fs/fs-writeback.c | 4 +++ fs/inode.c | 4 +++ include/trace/events/vfs.h | 53 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/vfs.h
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be169..6a2af1d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,6 +26,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <trace/events/vfs.h> #include "internal.h"
#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) @@ -900,6 +901,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) sb->s_op->dirty_inode(inode); }
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)) + trace_dirty_inode(inode, current); + /* * make sure that changes are seen by all cpus before we test i_state * -- mikulas diff --git a/fs/inode.c b/fs/inode.c index 722860b..a06184f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1626,3 +1626,7 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); + +#define CREATE_TRACE_POINTS +#include <trace/events/vfs.h> + diff --git a/include/trace/events/vfs.h b/include/trace/events/vfs.h new file mode 100644 index 0000000..21cf9fb --- /dev/null +++ b/include/trace/events/vfs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfs + +#if !defined(_TRACE_VFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFS_H + +/* + * Tracepoint for dirtying an inode: + */ +TRACE_EVENT(dirty_inode, + + TP_PROTO(struct inode *inode, struct task_struct *task), + + TP_ARGS(inode, task), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __array( char, dev, 16 ) + __array( char, file, 32 ) + ), + + TP_fast_assign( + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + strlcpy(__entry->file, name, 32); + strlcpy(__entry->dev, inode->i_sb->s_id, 16); + + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } + ), + + TP_printk("task=%i (%s) file=%s dev=%s", + __entry->pid, __entry->comm, __entry->file, __entry->dev) +); + +#endif /* _TRACE_VFS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>