From: Arjan van de Ven arjan@linux.intel.com
In order for PowerTOP to be able to report how well the new runtime PM is working for the various drivers, the kernel needs to export some basic statistics in sysfs.
This patch adds two sysfs files in the runtime PM domain that expose the total time a device has been active, and the time a device has been suspended.
With this PowerTOP can compute the activity percentage
Active %age = 100 * (delta active) / (delta active + delta suspended)
and present the information to the user.
I've written the PowerTOP code (slated for version 1.12) already, and the output looks like this:
Runtime Device Power Management statistics Active Device name 10.0% 06:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8101E/RTL8102E PCI Express Fast Ethernet controller
[version 2: fix stat update bugs noticed by Alan Stern] [version 3: rebase to -next and move the sysfs declaration]
Signed-off-by: Arjan van de Ven arjan@linux.intel.com Signed-off-by: Rafael J. Wysocki rjw@sisk.pl --- drivers/base/power/runtime.c | 54 ++++++++++++++++++++++++++++++---- drivers/base/power/sysfs.c | 65 +++++++++++++++++++++++++++++++++++++++++- include/linux/pm.h | 6 ++++ 3 files changed, 117 insertions(+), 8 deletions(-)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b0ec0e9..b78c401 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -123,6 +123,45 @@ int pm_runtime_idle(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_idle);
+ +/** + * update_pm_runtime_accounting - Update the time accounting of power states + * @dev: Device to update the accounting for + * + * In order to be able to have time accounting of the various power states + * (as used by programs such as PowerTOP to show the effectiveness of runtime + * PM), we need to track the time spent in each state. + * update_pm_runtime_accounting must be called each time before the + * runtime_status field is updated, to account the time in the old state + * correctly. + */ +void update_pm_runtime_accounting(struct device *dev) +{ + unsigned long now = jiffies; + int delta; + + delta = now - dev->power.accounting_timestamp; + + if (delta < 0) + delta = 0; + + dev->power.accounting_timestamp = now; + + if (dev->power.disable_depth > 0) + return; + + if (dev->power.runtime_status == RPM_SUSPENDED) + dev->power.suspended_jiffies += delta; + else + dev->power.active_jiffies += delta; +} + +static void __update_runtime_status(struct device *dev, enum rpm_status status) +{ + update_pm_runtime_accounting(dev); + dev->power.runtime_status = status; +} + /** * __pm_runtime_suspend - Carry out run-time suspend of given device. * @dev: Device to suspend. @@ -197,7 +236,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_SUSPENDING; + __update_runtime_status(dev, RPM_SUSPENDING); dev->power.deferred_resume = false;
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { @@ -228,7 +267,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) }
if (retval) { - dev->power.runtime_status = RPM_ACTIVE; + __update_runtime_status(dev, RPM_ACTIVE); if (retval == -EAGAIN || retval == -EBUSY) { if (dev->power.timer_expires == 0) notify = true; @@ -237,7 +276,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) pm_runtime_cancel_pending(dev); } } else { - dev->power.runtime_status = RPM_SUSPENDED; + __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev);
if (dev->parent) { @@ -381,7 +420,7 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_RESUMING; + __update_runtime_status(dev, RPM_RESUMING);
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { spin_unlock_irq(&dev->power.lock); @@ -411,10 +450,10 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) }
if (retval) { - dev->power.runtime_status = RPM_SUSPENDED; + __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); } else { - dev->power.runtime_status = RPM_ACTIVE; + __update_runtime_status(dev, RPM_ACTIVE); if (parent) atomic_inc(&parent->power.child_count); } @@ -848,7 +887,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) }
out_set: - dev->power.runtime_status = status; + __update_runtime_status(dev, status); dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); @@ -1077,6 +1116,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; + dev->power.accounting_timestamp = jiffies; INIT_WORK(&dev->power.work, pm_runtime_work);
dev->power.timer_expires = 0; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a4c33bc..7ee064f 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,6 +6,7 @@ #include <linux/string.h> #include <linux/pm_runtime.h> #include <asm/atomic.h> +#include <linux/jiffies.h> #include "power.h"
/* @@ -108,6 +109,66 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr, }
static DEVICE_ATTR(control, 0644, control_show, control_store); + +static ssize_t rtpm_active_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + spin_lock_irq(&dev->power.lock); + update_pm_runtime_accounting(dev); + ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies)); + spin_unlock_irq(&dev->power.lock); + return ret; +} + +static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL); + +static ssize_t rtpm_suspended_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + spin_lock_irq(&dev->power.lock); + update_pm_runtime_accounting(dev); + ret = sprintf(buf, "%i\n", + jiffies_to_msecs(dev->power.suspended_jiffies)); + spin_unlock_irq(&dev->power.lock); + return ret; +} + +static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL); + +static ssize_t rtpm_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const char *p; + + if (dev->power.runtime_error) { + p = "error\n"; + } else if (dev->power.disable_depth) { + p = "unsupported\n"; + } else { + switch (dev->power.runtime_status) { + case RPM_SUSPENDED: + p = "suspended\n"; + break; + case RPM_SUSPENDING: + p = "suspending\n"; + break; + case RPM_RESUMING: + p = "resuming\n"; + break; + case RPM_ACTIVE: + p = "active\n"; + break; + default: + return -EIO; + } + } + return sprintf(buf, p); +} + +static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL); +>>>>>>> 8d4b9d1... PM / Runtime: Add runtime PM statistics (v3):drivers/base/power/sysfs.c #endif
static ssize_t @@ -228,6 +289,9 @@ static DEVICE_ATTR(async, 0644, async_show, async_store); static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_control.attr, + &dev_attr_runtime_status.attr, + &dev_attr_runtime_suspended_time.attr, + &dev_attr_runtime_active_time.attr, #endif &dev_attr_wakeup.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG @@ -235,7 +299,6 @@ static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr, - &dev_attr_runtime_status.attr, &dev_attr_runtime_enabled.attr, #endif #endif diff --git a/include/linux/pm.h b/include/linux/pm.h index 8e258c7..dca597f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -476,9 +476,15 @@ struct dev_pm_info { enum rpm_request request; enum rpm_status runtime_status; int runtime_error; + unsigned long active_jiffies; + unsigned long suspended_jiffies; + unsigned long accounting_timestamp; #endif };
+extern void update_pm_runtime_accounting(struct device *dev); + + /* * The PM_EVENT_ messages are also used by drivers implementing the legacy * suspend framework, based on the ->suspend() and ->resume() callbacks common
From: Arjan van de Ven arjan@linux.intel.com
PowerTOP wants to be able to show the user how effective the ALPM link power management is. ALPM is worth around 0.5W on a quiet link; PowerTOP wants to be able to find cases where the "quiet link" isn't actually quiet.
This patch adds state accounting functionality to the AHCI driver for PowerTOP to use. The parts of the patch are: 1) the sysfs logic for exposing the stats for each state in sysfs; 2) the basic accounting logic that gets updated on link change interrupts (or when the user accesses the info from sysfs); 3) an "accounting enable" flag: in order to get the accounting to work, the driver needs to get phyrdy interrupts on link status changes. Normally and currently this is disabled by the driver when ALPM is on (to reduce overhead); when PowerTOP is running this will need to be on to get usable statistics... hence the sysfs tunable.
The PowerTOP output currently looks like this:
Recent SATA AHCI link activity statistics Active Partial Slumber Device name 0.5% 99.5% 0.0% host0
(work to resolve "host0" to a more human readable name is in progress)
Signed-off-by: Arjan van de Ven arjan@linux.intel.com --- drivers/ata/ahci.h | 15 ++++ drivers/ata/libahci.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 203 insertions(+), 2 deletions(-)
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 7113c57..6a3a291 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -261,6 +261,13 @@ struct ahci_em_priv { unsigned long led_state; };
+enum ahci_port_states { + AHCI_PORT_NOLINK = 0, + AHCI_PORT_ACTIVE = 1, + AHCI_PORT_PARTIAL = 2, + AHCI_PORT_SLUMBER = 3 +}; + struct ahci_port_priv { struct ata_link *active_link; struct ahci_cmd_hdr *cmd_slot; @@ -279,6 +286,14 @@ struct ahci_port_priv { int fbs_last_dev; /* save FBS.DEV of last FIS */ /* enclosure management info per PM slot */ struct ahci_em_priv em_priv[EM_MAX_SLOTS]; + + /* ALPM accounting state and stats */ + unsigned int accounting_active:1; + u64 active_jiffies; + u64 partial_jiffies; + u64 slumber_jiffies; + int previous_state; + int previous_jiffies; };
struct ahci_host_priv { diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 81e772a..c3250ee 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -59,6 +59,20 @@ MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ig static int ahci_enable_alpm(struct ata_port *ap, enum link_pm policy); static void ahci_disable_alpm(struct ata_port *ap); +static ssize_t ahci_alpm_show_active(struct device *dev, + struct device_attribute *attr, char *buf); +static ssize_t ahci_alpm_show_slumber(struct device *dev, + struct device_attribute *attr, char *buf); +static ssize_t ahci_alpm_show_partial(struct device *dev, + struct device_attribute *attr, char *buf); + +static ssize_t ahci_alpm_show_accounting(struct device *dev, + struct device_attribute *attr, char *buf); + +static ssize_t ahci_alpm_set_accounting(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); + static ssize_t ahci_led_show(struct ata_port *ap, char *buf); static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, size_t size); @@ -118,6 +132,12 @@ static DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL); static DEVICE_ATTR(ahci_host_cap2, S_IRUGO, ahci_show_host_cap2, NULL); static DEVICE_ATTR(ahci_host_version, S_IRUGO, ahci_show_host_version, NULL); static DEVICE_ATTR(ahci_port_cmd, S_IRUGO, ahci_show_port_cmd, NULL); +static DEVICE_ATTR(ahci_alpm_active, S_IRUGO, ahci_alpm_show_active, NULL); +static DEVICE_ATTR(ahci_alpm_partial, S_IRUGO, ahci_alpm_show_partial, NULL); +static DEVICE_ATTR(ahci_alpm_slumber, S_IRUGO, ahci_alpm_show_slumber, NULL); +static DEVICE_ATTR(ahci_alpm_accounting, S_IRUGO | S_IWUSR, + ahci_alpm_show_accounting, ahci_alpm_set_accounting); + static DEVICE_ATTR(em_buffer, S_IWUSR | S_IRUGO, ahci_read_em_buffer, ahci_store_em_buffer);
@@ -129,6 +149,10 @@ static struct device_attribute *ahci_shost_attrs[] = { &dev_attr_ahci_host_cap2, &dev_attr_ahci_host_version, &dev_attr_ahci_port_cmd, + &dev_attr_ahci_alpm_active, + &dev_attr_ahci_alpm_partial, + &dev_attr_ahci_alpm_slumber, + &dev_attr_ahci_alpm_accounting, &dev_attr_em_buffer, NULL }; @@ -734,9 +758,14 @@ static int ahci_enable_alpm(struct ata_port *ap, * getting woken up due to spurious phy ready interrupts * TBD - Hot plug should be done via polling now, is * that even supported? + * + * However, when accounting_active is set, we do want + * the interrupts for accounting purposes. */ - pp->intr_mask &= ~PORT_IRQ_PHYRDY; - writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + if (!pp->accounting_active) { + pp->intr_mask &= ~PORT_IRQ_PHYRDY; + writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + }
/* * Set a flag to indicate that we should ignore all PhyRdy @@ -1645,6 +1674,162 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat) ata_port_abort(ap); }
+static int get_current_alpm_state(struct ata_port *ap) +{ + u32 status = 0; + + ahci_scr_read(&ap->link, SCR_STATUS, &status); + + /* link status is in bits 11-8 */ + status = status >> 8; + status = status & 0x7; + + if (status == 6) + return AHCI_PORT_SLUMBER; + if (status == 2) + return AHCI_PORT_PARTIAL; + if (status == 1) + return AHCI_PORT_ACTIVE; + return AHCI_PORT_NOLINK; +} + +static void account_alpm_stats(struct ata_port *ap) +{ + struct ahci_port_priv *pp; + + int new_state; + u64 new_jiffies, jiffies_delta; + + if (ap == NULL) + return; + pp = ap->private_data; + + if (!pp) return; + + new_state = get_current_alpm_state(ap); + new_jiffies = jiffies; + + jiffies_delta = new_jiffies - pp->previous_jiffies; + + switch (pp->previous_state) { + case AHCI_PORT_NOLINK: + pp->active_jiffies = 0; + pp->partial_jiffies = 0; + pp->slumber_jiffies = 0; + break; + case AHCI_PORT_ACTIVE: + pp->active_jiffies += jiffies_delta; + break; + case AHCI_PORT_PARTIAL: + pp->partial_jiffies += jiffies_delta; + break; + case AHCI_PORT_SLUMBER: + pp->slumber_jiffies += jiffies_delta; + break; + default: + break; + } + pp->previous_state = new_state; + pp->previous_jiffies = new_jiffies; +} + +static ssize_t ahci_alpm_show_active(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + pp = ap->private_data; + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", jiffies_to_msecs(pp->active_jiffies)); +} + +static ssize_t ahci_alpm_show_partial(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", 
jiffies_to_msecs(pp->partial_jiffies)); +} + +static ssize_t ahci_alpm_show_slumber(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + + account_alpm_stats(ap); + + return sprintf(buf, "%u\n", jiffies_to_msecs(pp->slumber_jiffies)); +} + +static ssize_t ahci_alpm_show_accounting(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + + if (!ap || ata_port_is_dummy(ap)) + return -EINVAL; + + pp = ap->private_data; + + return sprintf(buf, "%u\n", pp->accounting_active); +} + +static ssize_t ahci_alpm_set_accounting(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long flags; + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + struct ahci_port_priv *pp; + void __iomem *port_mmio; + + if (!ap || ata_port_is_dummy(ap)) + return 1; + + pp = ap->private_data; + port_mmio = ahci_port_base(ap); + + if (!pp) + return 1; + if (buf[0] == '0') + pp->accounting_active = 0; + if (buf[0] == '1') + pp->accounting_active = 1; + + /* we need to enable the PHYRDY interrupt when we want accounting */ + if (pp->accounting_active) { + spin_lock_irqsave(ap->lock, flags); + pp->intr_mask |= PORT_IRQ_PHYRDY; + writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK); + spin_unlock_irqrestore(ap->lock, flags); + } + return count; +} + + static void ahci_port_intr(struct ata_port *ap) { void __iomem *port_mmio = ahci_port_base(ap); @@ -1670,6 +1855,7 @@ static void ahci_port_intr(struct ata_port *ap) if ((hpriv->flags & AHCI_HFLAG_NO_HOTPLUG) && (status & PORT_IRQ_PHYRDY)) { status &= ~PORT_IRQ_PHYRDY; + account_alpm_stats(ap); 
ahci_scr_write(&ap->link, SCR_ERROR, ((1 << 16) | (1 << 18))); }
From: Arjan van de Ven arjan@linux.intel.com
PowerTOP would like to be able to show who is keeping the disk busy by dirtying data. The most logical spot for this is in the vfs in the mark_inode_dirty() function; doing this on the block level is not possible because by the time the IO hits the block layer the guilty party can no longer be found ("kjournald" and "pdflush" are not useful answers to "who caused this file to be dirty?").
The trace point follows the same logic/style as the block_dump code and pretty much dumps the same data, just not to dmesg (and thus to /var/log/messages) but via the trace events streams.
Signed-off-by: Arjan van de Ven arjan@linux.intel.com --- fs/fs-writeback.c | 4 +++ fs/inode.c | 4 +++ include/trace/events/vfs.h | 53 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/vfs.h
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be169..6a2af1d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,6 +26,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <trace/events/vfs.h> #include "internal.h"
#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) @@ -900,6 +901,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) sb->s_op->dirty_inode(inode); }
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)) + trace_dirty_inode(inode, current); + /* * make sure that changes are seen by all cpus before we test i_state * -- mikulas diff --git a/fs/inode.c b/fs/inode.c index 722860b..a06184f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1626,3 +1626,7 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); + +#define CREATE_TRACE_POINTS +#include <trace/events/vfs.h> + diff --git a/include/trace/events/vfs.h b/include/trace/events/vfs.h new file mode 100644 index 0000000..21cf9fb --- /dev/null +++ b/include/trace/events/vfs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfs + +#if !defined(_TRACE_VFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFS_H + +/* + * Tracepoint for dirtying an inode: + */ +TRACE_EVENT(dirty_inode, + + TP_PROTO(struct inode *inode, struct task_struct *task), + + TP_ARGS(inode, task), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __array( char, dev, 16 ) + __array( char, file, 32 ) + ), + + TP_fast_assign( + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + strlcpy(__entry->file, name, 32); + strlcpy(__entry->dev, inode->i_sb->s_id, 16); + + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } + ), + + TP_printk("task=%i (%s) file=%s dev=%s", + __entry->pid, __entry->comm, __entry->file, __entry->dev) +); + +#endif /* _TRACE_VFS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>
This patch doesn't apply cleanly to the current Ubuntu Maverick linux kernel as it appears we're already carrying a version of this patch:
commit ce3cdc8812cc8a6f5645a9c372c7d6227676a7e5 Author: Arjan van de Ven arjan@linux.intel.com Date: Sun Oct 25 15:37:04 2009 -0700
UBUNTU: SAUCE: vfs: Add a trace point in the mark_inode_dirty function
There's only a 1 line discrepancy (see below) between what we currently have in the Maverick Ubuntu kernel and the patch inlined below. Care to re-send an updated patch to only add the additional one line change? Care to also include your SOB on the patch?
Thanks, Leann
On Thu, 2010-09-09 at 19:20 +0800, yong.shen@linaro.org wrote:
From: Arjan van de Ven arjan@linux.intel.com
PowerTOP would like to be able to show who is keeping the disk busy by dirtying data. The most logical spot for this is in the vfs in the mark_inode_dirty() function; doing this on the block level is not possible because by the time the IO hits the block layer the guilty party can no longer be found ("kjournald" and "pdflush" are not useful answers to "who caused this file to be dirty?").
The trace point follows the same logic/style as the block_dump code and pretty much dumps the same data, just not to dmesg (and thus to /var/log/messages) but via the trace events streams.
Signed-off-by: Arjan van de Ven arjan@linux.intel.com
fs/fs-writeback.c | 4 +++ fs/inode.c | 4 +++ include/trace/events/vfs.h | 53 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/vfs.h
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be169..6a2af1d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,6 +26,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <trace/events/vfs.h> #include "internal.h" #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) @@ -900,6 +901,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) sb->s_op->dirty_inode(inode); }
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES))
Looks like the above line is the only additional change from what we already have.
trace_dirty_inode(inode, current);
- /*
- make sure that changes are seen by all cpus before we test i_state
- -- mikulas
diff --git a/fs/inode.c b/fs/inode.c index 722860b..a06184f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1626,3 +1626,7 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner);
+#define CREATE_TRACE_POINTS +#include <trace/events/vfs.h>
diff --git a/include/trace/events/vfs.h b/include/trace/events/vfs.h new file mode 100644 index 0000000..21cf9fb --- /dev/null +++ b/include/trace/events/vfs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfs
+#if !defined(_TRACE_VFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFS_H
+/*
- Tracepoint for dirtying an inode:
- */
+TRACE_EVENT(dirty_inode,
- TP_PROTO(struct inode *inode, struct task_struct *task),
- TP_ARGS(inode, task),
- TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__array( char, dev, 16 )
__array( char, file, 32 )
- ),
- TP_fast_assign(
if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
struct dentry *dentry;
const char *name = "?";
dentry = d_find_alias(inode);
if (dentry) {
spin_lock(&dentry->d_lock);
name = (const char *) dentry->d_name.name;
}
memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
__entry->pid = task->pid;
strlcpy(__entry->file, name, 32);
strlcpy(__entry->dev, inode->i_sb->s_id, 16);
if (dentry) {
spin_unlock(&dentry->d_lock);
dput(dentry);
}
}
- ),
- TP_printk("task=%i (%s) file=%s dev=%s",
__entry->pid, __entry->comm, __entry->file, __entry->dev)
+);
+#endif /* _TRACE_VFS_H */
+/* This part must be outside protection */
+#include <trace/define_trace.h>
1.6.3.3
Yong,
You forgot to add a note as to why you need these patches applied :)
Reviewers - these patches expose some new information related to runtime PM that powertop can show.
This patch, 1/3, is already in 2.6.36-rc. The other two aren't upstream.
Please consider applying to the Ubuntu kernel.
Regards, Amit
On Thu, Sep 9, 2010 at 2:20 PM, yong.shen@linaro.org wrote:
From: Arjan van de Ven arjan@linux.intel.com
In order for PowerTOP to be able to report how well the new runtime PM is working for the various drivers, the kernel needs to export some basic statistics in sysfs.
This patch adds two sysfs files in the runtime PM domain that expose the total time a device has been active, and the time a device has been suspended.
With this PowerTOP can compute the activity percentage
Active %age = 100 * (delta active) / (delta active + delta suspended)
and present the information to the user.
I've written the PowerTOP code (slated for version 1.12) already, and the output looks like this:
Runtime Device Power Management statistics Active Device name 10.0% 06:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8101E/RTL8102E PCI Express Fast Ethernet controller
[version 2: fix stat update bugs noticed by Alan Stern] [version 3: rebase to -next and move the sysfs declaration]
Signed-off-by: Arjan van de Ven arjan@linux.intel.com Signed-off-by: Rafael J. Wysocki rjw@sisk.pl
drivers/base/power/runtime.c | 54 ++++++++++++++++++++++++++++++---- drivers/base/power/sysfs.c | 65 +++++++++++++++++++++++++++++++++++++++++- include/linux/pm.h | 6 ++++ 3 files changed, 117 insertions(+), 8 deletions(-)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b0ec0e9..b78c401 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -123,6 +123,45 @@ int pm_runtime_idle(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_idle);
+/**
- update_pm_runtime_accounting - Update the time accounting of power states
- @dev: Device to update the accounting for
- In order to be able to have time accounting of the various power states
- (as used by programs such as PowerTOP to show the effectiveness of runtime
- PM), we need to track the time spent in each state.
- update_pm_runtime_accounting must be called each time before the
- runtime_status field is updated, to account the time in the old state
- correctly.
- */
+void update_pm_runtime_accounting(struct device *dev) +{
- unsigned long now = jiffies;
- int delta;
- delta = now - dev->power.accounting_timestamp;
- if (delta < 0)
- delta = 0;
- dev->power.accounting_timestamp = now;
- if (dev->power.disable_depth > 0)
- return;
- if (dev->power.runtime_status == RPM_SUSPENDED)
- dev->power.suspended_jiffies += delta;
- else
- dev->power.active_jiffies += delta;
+}
+static void __update_runtime_status(struct device *dev, enum rpm_status status) +{
- update_pm_runtime_accounting(dev);
- dev->power.runtime_status = status;
+}
/** * __pm_runtime_suspend - Carry out run-time suspend of given device. * @dev: Device to suspend. @@ -197,7 +236,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_SUSPENDING;
- __update_runtime_status(dev, RPM_SUSPENDING);
dev->power.deferred_resume = false;
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { @@ -228,7 +267,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) }
if (retval) {
- dev->power.runtime_status = RPM_ACTIVE;
- __update_runtime_status(dev, RPM_ACTIVE);
if (retval == -EAGAIN || retval == -EBUSY) { if (dev->power.timer_expires == 0) notify = true; @@ -237,7 +276,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) pm_runtime_cancel_pending(dev); } } else {
- dev->power.runtime_status = RPM_SUSPENDED;
- __update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_deactivate_timer(dev);
if (dev->parent) { @@ -381,7 +420,7 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) goto repeat; }
- dev->power.runtime_status = RPM_RESUMING;
- __update_runtime_status(dev, RPM_RESUMING);
if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { spin_unlock_irq(&dev->power.lock); @@ -411,10 +450,10 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) }
if (retval) {
- dev->power.runtime_status = RPM_SUSPENDED;
- __update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev); } else {
- dev->power.runtime_status = RPM_ACTIVE;
- __update_runtime_status(dev, RPM_ACTIVE);
if (parent) atomic_inc(&parent->power.child_count); } @@ -848,7 +887,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) }
out_set:
- dev->power.runtime_status = status;
- __update_runtime_status(dev, status);
dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); @@ -1077,6 +1116,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false;
- dev->power.accounting_timestamp = jiffies;
INIT_WORK(&dev->power.work, pm_runtime_work);
dev->power.timer_expires = 0; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a4c33bc..7ee064f 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,6 +6,7 @@ #include <linux/string.h> #include <linux/pm_runtime.h> #include <asm/atomic.h> +#include <linux/jiffies.h> #include "power.h"
/* @@ -108,6 +109,66 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr, }
static DEVICE_ATTR(control, 0644, control_show, control_store);
+static ssize_t rtpm_active_time_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+{
- int ret;
- spin_lock_irq(&dev->power.lock);
- update_pm_runtime_accounting(dev);
- ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies));
- spin_unlock_irq(&dev->power.lock);
- return ret;
+}
+static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL);
+static ssize_t rtpm_suspended_time_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+{
- int ret;
- spin_lock_irq(&dev->power.lock);
- update_pm_runtime_accounting(dev);
- ret = sprintf(buf, "%i\n",
- jiffies_to_msecs(dev->power.suspended_jiffies));
- spin_unlock_irq(&dev->power.lock);
- return ret;
+}
+static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL);
+static ssize_t rtpm_status_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+{
- const char *p;
- if (dev->power.runtime_error) {
- p = "error\n";
- } else if (dev->power.disable_depth) {
- p = "unsupported\n";
- } else {
- switch (dev->power.runtime_status) {
- case RPM_SUSPENDED:
- p = "suspended\n";
- break;
- case RPM_SUSPENDING:
- p = "suspending\n";
- break;
- case RPM_RESUMING:
- p = "resuming\n";
- break;
- case RPM_ACTIVE:
- p = "active\n";
- break;
- default:
- return -EIO;
- }
- }
- return sprintf(buf, p);
+}
+static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL); +>>>>>>> 8d4b9d1... PM / Runtime: Add runtime PM statistics (v3):drivers/base/power/sysfs.c #endif
static ssize_t @@ -228,6 +289,9 @@ static DEVICE_ATTR(async, 0644, async_show, async_store); static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_control.attr,
- &dev_attr_runtime_status.attr,
- &dev_attr_runtime_suspended_time.attr,
- &dev_attr_runtime_active_time.attr,
#endif &dev_attr_wakeup.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG @@ -235,7 +299,6 @@ static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr,
- &dev_attr_runtime_status.attr,
&dev_attr_runtime_enabled.attr, #endif #endif diff --git a/include/linux/pm.h b/include/linux/pm.h index 8e258c7..dca597f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -476,9 +476,15 @@ struct dev_pm_info { enum rpm_request request; enum rpm_status runtime_status; int runtime_error;
- unsigned long active_jiffies;
- unsigned long suspended_jiffies;
- unsigned long accounting_timestamp;
#endif };
+extern void update_pm_runtime_accounting(struct device *dev);
/* * The PM_EVENT_ messages are also used by drivers implementing the legacy * suspend framework, based on the ->suspend() and ->resume() callbacks common -- 1.6.3.3
On Thu, 2010-09-09 at 14:30 +0300, Amit Kucheria wrote:
Yong,
You forgot to add a note as to why you need these patches applied :)
Reviewers - these patches expose some new information related to runtime PM that powertop can show.
This patch, 1/3, is already in 2.6.36-rc. The other two aren't upstream.
Please consider applying to the Ubuntu kernel.
As Amit noted, this appears to be upstream:
commit 8d4b9d1bfef117862a2889dec4dac227068544c9 Author: Arjan van de Ven arjan@linux.intel.com Date: Mon Jul 19 02:01:06 2010 +0200
PM / Runtime: Add runtime PM statistics (v3)
However, was the patch that was inlined below even compiled or tested? It appears the patch has some unwanted remnants from when the conflicts of the cherry-pick were being fixed up (see inlined comment below). Yong, care to fix this up and re-send? Care to also provide your SOB for all three patches?
Thanks, Leann
Regards, Amit
On Thu, Sep 9, 2010 at 2:20 PM, yong.shen@linaro.org wrote:
From: Arjan van de Ven arjan@linux.intel.com
In order for PowerTOP to be able to report how well the new runtime PM is working for the various drivers, the kernel needs to export some basic statistics in sysfs.
This patch adds two sysfs files in the runtime PM domain that expose the total time a device has been active, and the time a device has been suspended.
With this PowerTOP can compute the activity percentage
Active %age = 100 * (delta active) / (delta active + delta suspended)
and present the information to the user.
I've written the PowerTOP code (slated for version 1.12) already, and the output looks like this:
Runtime Device Power Management statistics Active Device name 10.0% 06:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8101E/RTL8102E PCI Express Fast Ethernet controller
[version 2: fix stat update bugs noticed by Alan Stern] [version 3: rebase to -next and move the sysfs declaration]
Signed-off-by: Arjan van de Ven arjan@linux.intel.com Signed-off-by: Rafael J. Wysocki rjw@sisk.pl
drivers/base/power/runtime.c | 54 ++++++++++++++++++++++++++++++---- drivers/base/power/sysfs.c | 65 +++++++++++++++++++++++++++++++++++++++++- include/linux/pm.h | 6 ++++ 3 files changed, 117 insertions(+), 8 deletions(-)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b0ec0e9..b78c401 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -123,6 +123,45 @@ int pm_runtime_idle(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_idle);
+/**
- update_pm_runtime_accounting - Update the time accounting of power states
- @dev: Device to update the accounting for
- In order to be able to have time accounting of the various power states
- (as used by programs such as PowerTOP to show the effectiveness of runtime
- PM), we need to track the time spent in each state.
- update_pm_runtime_accounting must be called each time before the
- runtime_status field is updated, to account the time in the old state
- correctly.
- */
+void update_pm_runtime_accounting(struct device *dev) +{
unsigned long now = jiffies;
int delta;
delta = now - dev->power.accounting_timestamp;
if (delta < 0)
delta = 0;
dev->power.accounting_timestamp = now;
if (dev->power.disable_depth > 0)
return;
if (dev->power.runtime_status == RPM_SUSPENDED)
dev->power.suspended_jiffies += delta;
else
dev->power.active_jiffies += delta;
+}
+static void __update_runtime_status(struct device *dev, enum rpm_status status) +{
update_pm_runtime_accounting(dev);
dev->power.runtime_status = status;
+}
/**
- __pm_runtime_suspend - Carry out run-time suspend of given device.
- @dev: Device to suspend.
@@ -197,7 +236,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) goto repeat; }
dev->power.runtime_status = RPM_SUSPENDING;
__update_runtime_status(dev, RPM_SUSPENDING); dev->power.deferred_resume = false; if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) {
@@ -228,7 +267,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) }
if (retval) {
dev->power.runtime_status = RPM_ACTIVE;
__update_runtime_status(dev, RPM_ACTIVE); if (retval == -EAGAIN || retval == -EBUSY) { if (dev->power.timer_expires == 0) notify = true;
@@ -237,7 +276,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) pm_runtime_cancel_pending(dev); } } else {
dev->power.runtime_status = RPM_SUSPENDED;
__update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) {
@@ -381,7 +420,7 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) goto repeat; }
dev->power.runtime_status = RPM_RESUMING;
__update_runtime_status(dev, RPM_RESUMING); if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { spin_unlock_irq(&dev->power.lock);
@@ -411,10 +450,10 @@ int __pm_runtime_resume(struct device *dev, bool from_wq) }
if (retval) {
dev->power.runtime_status = RPM_SUSPENDED;
__update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); } else {
dev->power.runtime_status = RPM_ACTIVE;
__update_runtime_status(dev, RPM_ACTIVE); if (parent) atomic_inc(&parent->power.child_count); }
@@ -848,7 +887,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) }
out_set:
dev->power.runtime_status = status;
__update_runtime_status(dev, status); dev->power.runtime_error = 0;
out: spin_unlock_irqrestore(&dev->power.lock, flags); @@ -1077,6 +1116,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false;
dev->power.accounting_timestamp = jiffies; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0;
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a4c33bc..7ee064f 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,6 +6,7 @@ #include <linux/string.h> #include <linux/pm_runtime.h> #include <asm/atomic.h> +#include <linux/jiffies.h> #include "power.h"
/* @@ -108,6 +109,66 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr, }
static DEVICE_ATTR(control, 0644, control_show, control_store);
+static ssize_t rtpm_active_time_show(struct device *dev,
struct device_attribute *attr, char *buf)
+{
int ret;
spin_lock_irq(&dev->power.lock);
update_pm_runtime_accounting(dev);
ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies));
spin_unlock_irq(&dev->power.lock);
return ret;
+}
+static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL);
+static ssize_t rtpm_suspended_time_show(struct device *dev,
struct device_attribute *attr, char *buf)
+{
int ret;
spin_lock_irq(&dev->power.lock);
update_pm_runtime_accounting(dev);
ret = sprintf(buf, "%i\n",
jiffies_to_msecs(dev->power.suspended_jiffies));
spin_unlock_irq(&dev->power.lock);
return ret;
+}
+static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL);
+static ssize_t rtpm_status_show(struct device *dev,
struct device_attribute *attr, char *buf)
+{
const char *p;
if (dev->power.runtime_error) {
p = "error\n";
} else if (dev->power.disable_depth) {
p = "unsupported\n";
} else {
switch (dev->power.runtime_status) {
case RPM_SUSPENDED:
p = "suspended\n";
break;
case RPM_SUSPENDING:
p = "suspending\n";
break;
case RPM_RESUMING:
p = "resuming\n";
break;
case RPM_ACTIVE:
p = "active\n";
break;
default:
return -EIO;
}
}
return sprintf(buf, p);
+}
+static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL); +>>>>>>> 8d4b9d1... PM / Runtime: Add runtime PM statistics (v3):drivers/base/power/sysfs.c
It seems the merge-conflict marker above (the ">>>>>>> 8d4b9d1..." line) was accidentally left in when fixing up the cherry-pick conflicts ...
#endif
static ssize_t @@ -228,6 +289,9 @@ static DEVICE_ATTR(async, 0644, async_show, async_store); static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_control.attr,
&dev_attr_runtime_status.attr,
&dev_attr_runtime_suspended_time.attr,
&dev_attr_runtime_active_time.attr,
#endif &dev_attr_wakeup.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG @@ -235,7 +299,6 @@ static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr,
&dev_attr_runtime_status.attr, &dev_attr_runtime_enabled.attr,
#endif #endif diff --git a/include/linux/pm.h b/include/linux/pm.h index 8e258c7..dca597f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -476,9 +476,15 @@ struct dev_pm_info { enum rpm_request request; enum rpm_status runtime_status; int runtime_error;
unsigned long active_jiffies;
unsigned long suspended_jiffies;
unsigned long accounting_timestamp;
#endif };
+extern void update_pm_runtime_accounting(struct device *dev);
/*
- The PM_EVENT_ messages are also used by drivers implementing the legacy
- suspend framework, based on the ->suspend() and ->resume() callbacks common
-- 1.6.3.3