This series introduces changes to reduce the time required to send a frequency transition requests to the platform while using the cppc-cpufreq driver.
With these changes we see significant improvement in the average time to send freq. transition request to the platform. Profiling on an ARM platform showed that the average transaction time per request reduced from 200us to under 20us.
Ashwin Chaugule (1): ACPI / CPPC: Optimize PCC Read Write operations
Prashanth Prakash (3): acpi: cppc: optimized cpc_read and cpc_write mailbox: pcc: optimized pcc_send_data acpi: cppc: replace writeX/readX to PCC with relaxed version
drivers/acpi/cppc_acpi.c | 184 +++++++++++++++++++++++++++++++++++------------ drivers/mailbox/pcc.c | 110 ++++++++++++++++++++++++++-- 2 files changed, 241 insertions(+), 53 deletions(-)
From: Ashwin Chaugule ashwin.chaugule@linaro.org
Previously the send_pcc_cmd() code checked if the PCC operation had completed before returning from the function. This check was performed regardless of the PCC op type (i.e. Read/Write). Knowing the type of cmd can be used to optimize the check and avoid needless waiting. e.g. with Write ops, the actual Writing is done before calling send_pcc_cmd(). And the subsequent Writes will check if the channel is free at the entry of send_pcc_cmd() anyway.
However, for Read cmds, we need to wait for the cmd completion bit to be flipped, since the actual Read ops follow after returning from the send_pcc_cmd(). So, only do the looping check at the end for Read ops.
Also, instead of using udelay() calls, use ktime as a means to check for deadlines. The current deadline in which the Remote should flip the cmd completion bit is defined as N * Nominal latency. Where N is arbitrary and large enough to work on slow emulators and Nominal latency comes from the ACPI table (PCCT). This helps in working around the CONFIG_HZ effects on udelay() and also avoids needing different ACPI tables for Silicon and Emulation platforms.
Signed-off-by: Ashwin Chaugule ashwin.chaugule@linaro.org Signed-off-by: Prashanth Prakash pprakash@codeaurora.org --- drivers/acpi/cppc_acpi.c | 102 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 28 deletions(-)
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6730f96..36c3e4d 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -39,6 +39,7 @@
#include <linux/cpufreq.h> #include <linux/delay.h> +#include <linux/ktime.h>
#include <acpi/cppc_acpi.h> /* @@ -63,25 +64,56 @@ static struct mbox_chan *pcc_channel; static void __iomem *pcc_comm_addr; static u64 comm_base_addr; static int pcc_subspace_idx = -1; -static u16 pcc_cmd_delay; static bool pcc_channel_acquired; +static ktime_t deadline;
/* * Arbitrary Retries in case the remote processor is slow to respond - * to PCC commands. + * to PCC commands. Keeping it high enough to cover emulators where + * the processors run painfully slow. */ #define NUM_RETRIES 500
+static int check_pcc_chan(void) +{ + int result = -EIO; + struct acpi_pcct_shared_memory *generic_comm_base = + (struct acpi_pcct_shared_memory *) pcc_comm_addr; + ktime_t next_deadline = ktime_add(ktime_get(), deadline); + int retries = 0; + + /* Retry in case the remote processor was too slow to catch up. */ + while (!ktime_after(ktime_get(), next_deadline)) { + if (readw_relaxed(&generic_comm_base->status) & PCC_CMD_COMPLETE) { + result = 0; + break; + } + /* + * Reducing the bus traffic in case this loop takes longer than + * a few retries. + */ + udelay(3); + retries++; + } + + return result; +} + static int send_pcc_cmd(u16 cmd) { - int retries, result = -EIO; - struct acpi_pcct_hw_reduced *pcct_ss = pcc_channel->con_priv; + int ret = -EIO; struct acpi_pcct_shared_memory *generic_comm_base = (struct acpi_pcct_shared_memory *) pcc_comm_addr; - u32 cmd_latency = pcct_ss->latency;
- /* Min time OS should wait before sending next command. */ - udelay(pcc_cmd_delay); + /* + * For CMD_WRITE we know for a fact the caller should have checked + * the channel before writing to PCC space + */ + if (cmd == CMD_READ) { + ret = check_pcc_chan(); + if (ret) + return ret; + }
/* Write to the shared comm region. */ writew(cmd, &generic_comm_base->command); @@ -90,26 +122,25 @@ static int send_pcc_cmd(u16 cmd) writew(0, &generic_comm_base->status);
/* Ring doorbell */ - result = mbox_send_message(pcc_channel, &cmd); - if (result < 0) { + ret = mbox_send_message(pcc_channel, &cmd); + if (ret < 0) { pr_err("Err sending PCC mbox message. cmd:%d, ret:%d\n", - cmd, result); - return result; + cmd, ret); + return ret; }
- /* Wait for a nominal time to let platform process command. */ - udelay(cmd_latency); - - /* Retry in case the remote processor was too slow to catch up. */ - for (retries = NUM_RETRIES; retries > 0; retries--) { - if (readw_relaxed(&generic_comm_base->status) & PCC_CMD_COMPLETE) { - result = 0; - break; - } - } + /* + * For READs we need to ensure the cmd completed to ensure + * the ensuing read()s can proceed. For WRITEs we dont care + * because the actual write()s are done before coming here + * and the next READ or WRITE will check if the channel + * is busy/free at the entry of this call. + */ + if (cmd == CMD_READ) + ret = check_pcc_chan();
- mbox_client_txdone(pcc_channel, result); - return result; + mbox_client_txdone(pcc_channel, ret); + return ret; }
static void cppc_chan_tx_done(struct mbox_client *cl, void *msg, int ret) @@ -306,6 +337,7 @@ static int register_pcc_channel(int pcc_subspace_idx) { struct acpi_pcct_hw_reduced *cppc_ss; unsigned int len; + u64 usecs_lat;
if (pcc_subspace_idx >= 0) { pcc_channel = pcc_mbox_request_channel(&cppc_mbox_cl, @@ -335,7 +367,14 @@ static int register_pcc_channel(int pcc_subspace_idx) */ comm_base_addr = cppc_ss->base_address; len = cppc_ss->length; - pcc_cmd_delay = cppc_ss->min_turnaround_time; + + /* + * cppc_ss->latency is just a Nominal value. In reality + * the remote processor could be much slower to reply. + * So add an arbitrary amount of wait on top of Nominal. + */ + usecs_lat = NUM_RETRIES * cppc_ss->latency; + deadline = ns_to_ktime(usecs_lat * NSEC_PER_USEC);
pcc_comm_addr = acpi_os_ioremap(comm_base_addr, len); if (!pcc_comm_addr) { @@ -604,7 +643,7 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) (ref_perf->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) || (nom_perf->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM)) { /* Ring doorbell once to update PCC subspace */ - if (send_pcc_cmd(CMD_READ)) { + if (send_pcc_cmd(CMD_READ) < 0) { ret = -EIO; goto out_err; } @@ -662,7 +701,7 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) if ((delivered_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) || (reference_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM)) { /* Ring doorbell once to update PCC subspace */ - if (send_pcc_cmd(CMD_READ)) { + if (send_pcc_cmd(CMD_READ) < 0) { ret = -EIO; goto out_err; } @@ -713,6 +752,13 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
spin_lock(&pcc_lock);
+ /* If this is PCC reg, check if channel is free before writing */ + if (desired_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + ret = check_pcc_chan(); + if (ret) + goto busy_channel; + } + /* * Skip writing MIN/MAX until Linux knows how to come up with * useful values. @@ -722,10 +768,10 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) /* Is this a PCC reg ?*/ if (desired_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { /* Ring doorbell so Remote can get our perf request. */ - if (send_pcc_cmd(CMD_WRITE)) + if (send_pcc_cmd(CMD_WRITE) < 0) ret = -EIO; } - +busy_channel: spin_unlock(&pcc_lock);
return ret;
cpc_read and cpc_write are used while holding the pcc_lock spin_lock, so they need to be as fast as possible. acpi_os_read/write_memory APIs linearly search through a list for cached mapping which is quite expensive. Since the PCC subspace is already mapped into virtual address space during initialization, we can just add the offset and access the necessary CPPC registers.
This patch + similar changes to PCC driver reduce the time per freq. transition from around 200us to about 20us for cppc cpufreq driver
Signed-off-by: Prashanth Prakash pprakash@codeaurora.org --- drivers/acpi/cppc_acpi.c | 80 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 16 deletions(-)
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 36c3e4d..b85759d 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -67,6 +67,9 @@ static int pcc_subspace_idx = -1; static bool pcc_channel_acquired; static ktime_t deadline;
+/* pcc mapped address + header size + offset within PCC subspace */ +#define GET_PCC_VADDR(offs) (pcc_comm_addr + 0x8 + (offs)) + /* * Arbitrary Retries in case the remote processor is slow to respond * to PCC commands. Keeping it high enough to cover emulators where @@ -585,29 +588,74 @@ void acpi_cppc_processor_exit(struct acpi_processor *pr) } EXPORT_SYMBOL_GPL(acpi_cppc_processor_exit);
-static u64 get_phys_addr(struct cpc_reg *reg) -{ - /* PCC communication addr space begins at byte offset 0x8. */ - if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) - return (u64)comm_base_addr + 0x8 + reg->address; - else - return reg->address; -} +/* + * Since cpc_read and cpc_write are called while holding pcc_lock, it should be + * as fast as possible. We have already mapped the PCC subspace during init, so + * we can directly write to it. + */
-static void cpc_read(struct cpc_reg *reg, u64 *val) +static int cpc_read(struct cpc_reg *reg, u64 *val) { - u64 addr = get_phys_addr(reg); + int ret_val = 0; + + *val = 0; + if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + void *vaddr = GET_PCC_VADDR(reg->address);
- acpi_os_read_memory((acpi_physical_address)addr, - val, reg->bit_width); + switch (reg->bit_width) { + case 8: + *val = readb(vaddr); + break; + case 16: + *val = readw(vaddr); + break; + case 32: + *val = readl(vaddr); + break; + case 64: + *val = readq(vaddr); + break; + default: + pr_debug("Error: Cannot read %u bit width from PCC\n", + reg->bit_width); + ret_val = -EFAULT; + } + } else + ret_val = acpi_os_read_memory((acpi_physical_address)reg->address, + val, reg->bit_width); + return ret_val; }
-static void cpc_write(struct cpc_reg *reg, u64 val) +static int cpc_write(struct cpc_reg *reg, u64 val) { - u64 addr = get_phys_addr(reg); + int ret_val = 0; + + if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + void *vaddr = GET_PCC_VADDR(reg->address);
- acpi_os_write_memory((acpi_physical_address)addr, - val, reg->bit_width); + switch (reg->bit_width) { + case 8: + writeb(val, vaddr); + break; + case 16: + writew(val, vaddr); + break; + case 32: + writel(val, vaddr); + break; + case 64: + writeq(val, vaddr); + break; + default: + pr_debug("Error: Cannot write %u bit width to PCC\n", + reg->bit_width); + ret_val = -EFAULT; + break; + } + } else + ret_val = acpi_os_write_memory((acpi_physical_address)reg->address, + val, reg->bit_width); + return ret_val; }
/**
On 15 January 2016 at 13:43, Prashanth Prakash pprakash@codeaurora.org wrote:
cpc_read and cpc_write are used while holding the pcc_lock spin_lock, so they need to be as fast as possible. acpi_os_read/write_memory APIs linearly search through a list for cached mapping which is quite expensive. Since the PCC subspace is already mapped into virtual address space during initialization, we can just add the offset and access the necessary CPPC registers.
This patch + similar changes to PCC driver reduce the time per freq. transition from around 200us to about 20us for cppc cpufreq driver
Signed-off-by: Prashanth Prakash pprakash@codeaurora.org
drivers/acpi/cppc_acpi.c | 80 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 16 deletions(-)
Acked-by: Ashwin Chaugule ashwin.chaugule@linaro.org
pcc_send_data can be invoked during the execution of performance critical code as in cppc_cpufreq driver. With acpi_* APIs, the doorbell register accessed in pcc_send_data if present in system memory will be searched (in cached virt to phys addr mapping), mapped, read/written and then unmapped. These operations take significant amount of time.
This patch maps the performance critical doorbell register during init and then reads/writes to it directly using the mapped virtual address. This patch + similar changes to CPPC acpi driver reduce the time per freq. transition from around 200us to about 20us for cppc cpufreq driver
Signed-off-by: Prashanth Prakash pprakash@codeaurora.org --- drivers/mailbox/pcc.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 7 deletions(-)
diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 45d85ae..150e18e 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -70,6 +70,9 @@
static struct mbox_chan *pcc_mbox_channels;
+/*Array of cached virtual address for doorbell registers*/ +static void **pcc_doorbell_vaddr; + static struct mbox_controller pcc_mbox_ctrl = {}; /** * get_pcc_channel - Given a PCC subspace idx, get @@ -166,6 +169,66 @@ void pcc_mbox_free_channel(struct mbox_chan *chan) } EXPORT_SYMBOL_GPL(pcc_mbox_free_channel);
+/* + * PCC can be used with perf critical drivers such as CPPC + * So it makes sense to locally cache the virtual address and + * use it to read/write to PCC registers such as doorbell register + * + * The below read_register and write_registers are used to read and + * write from perf critical registers such as PCC doorbell register + */ +static int read_register(void *vaddr, u64 *val, unsigned int bit_width) +{ + int ret_val = 0; + + switch (bit_width) { + case 8: + *val = readb(vaddr); + break; + case 16: + *val = readw(vaddr); + break; + case 32: + *val = readl(vaddr); + break; + case 64: + *val = readq(vaddr); + break; + default: + pr_debug("Error: Cannot read register of %u bit width", + bit_width); + ret_val = -EFAULT; + break; + } + return ret_val; +} + +static int write_register(void *vaddr, u64 val, unsigned int bit_width) +{ + int ret_val = 0; + + switch (bit_width) { + case 8: + writeb(val, vaddr); + break; + case 16: + writew(val, vaddr); + break; + case 32: + writel(val, vaddr); + break; + case 64: + writeq(val, vaddr); + break; + default: + pr_debug("Error: Cannot write register of %u bit width", + bit_width); + ret_val = -EFAULT; + break; + } + return ret_val; +} + /** * pcc_send_data - Called from Mailbox Controller code. Used * here only to ring the channel doorbell. The PCC client @@ -181,21 +244,39 @@ EXPORT_SYMBOL_GPL(pcc_mbox_free_channel); static int pcc_send_data(struct mbox_chan *chan, void *data) { struct acpi_pcct_hw_reduced *pcct_ss = chan->con_priv; - struct acpi_generic_address doorbell; + struct acpi_generic_address *doorbell; u64 doorbell_preserve; u64 doorbell_val; u64 doorbell_write; + u32 id = chan - pcc_mbox_channels; + int ret = 0; + + if (id >= pcc_mbox_ctrl.num_chans) { + pr_debug("pcc_send_data: Invalid mbox_chan passed\n"); + return -ENOENT; + }
- doorbell = pcct_ss->doorbell_register; + doorbell = &pcct_ss->doorbell_register; doorbell_preserve = pcct_ss->preserve_mask; doorbell_write = pcct_ss->write_mask;
/* Sync notification from OS to Platform. */ - acpi_read(&doorbell_val, &doorbell); - acpi_write((doorbell_val & doorbell_preserve) | doorbell_write, - &doorbell); - - return 0; + if (pcc_doorbell_vaddr[id]) { + ret = read_register(pcc_doorbell_vaddr[id], &doorbell_val, + doorbell->bit_width); + if (ret) + return ret; + ret = write_register(pcc_doorbell_vaddr[id], + (doorbell_val & doorbell_preserve) | doorbell_write, + doorbell->bit_width); + } else { + ret = acpi_read(&doorbell_val, doorbell); + if (ret) + return ret; + ret = acpi_write((doorbell_val & doorbell_preserve) | doorbell_write, + doorbell); + } + return ret; }
static const struct mbox_chan_ops pcc_chan_ops = { @@ -271,14 +352,29 @@ static int __init acpi_pcc_probe(void) return -ENOMEM; }
+ pcc_doorbell_vaddr = kcalloc(count, sizeof(void *), GFP_KERNEL); + if (!pcc_doorbell_vaddr) { + kfree(pcc_mbox_channels); + return -ENOMEM; + } + /* Point to the first PCC subspace entry */ pcct_entry = (struct acpi_subtable_header *) ( (unsigned long) pcct_tbl + sizeof(struct acpi_table_pcct));
for (i = 0; i < count; i++) { + struct acpi_generic_address *db_reg; + struct acpi_pcct_hw_reduced *pcct_ss; pcc_mbox_channels[i].con_priv = pcct_entry; pcct_entry = (struct acpi_subtable_header *) ((unsigned long) pcct_entry + pcct_entry->length); + + /* If doorbell is in system memory cache the virt address */ + pcct_ss = (struct acpi_pcct_hw_reduced *)pcct_entry; + db_reg = &pcct_ss->doorbell_register; + if (db_reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) + pcc_doorbell_vaddr[i] = acpi_os_ioremap(db_reg->address, + db_reg->bit_width/8); }
pcc_mbox_ctrl.num_chans = count;
Hi Prashanth,
[auto build test ERROR on pm/linux-next] [also build test ERROR on v4.4 next-20160115] [if your patch is applied to the wrong git tree, please drop us a note to help improving the system]
url: https://github.com/0day-ci/linux/commits/Prashanth-Prakash/acpi-cppc-optimiz... base: https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next config: i386-allmodconfig (attached as .config) reproduce: # save the attached .config to linux build tree make ARCH=i386
All errors (new ones prefixed by >>):
drivers/mailbox/pcc.c: In function 'read_register':
drivers/mailbox/pcc.c:195:10: error: implicit declaration of function 'readq' [-Werror=implicit-function-declaration]
*val = readq(vaddr); ^ drivers/mailbox/pcc.c: In function 'write_register':
drivers/mailbox/pcc.c:221:3: error: implicit declaration of function 'writeq' [-Werror=implicit-function-declaration]
writeq(val, vaddr); ^ cc1: some warnings being treated as errors
vim +/readq +195 drivers/mailbox/pcc.c
189 *val = readw(vaddr); 190 break; 191 case 32: 192 *val = readl(vaddr); 193 break; 194 case 64:
195 *val = readq(vaddr);
196 break; 197 default: 198 pr_debug("Error: Cannot read register of %u bit width", 199 bit_width); 200 ret_val = -EFAULT; 201 break; 202 } 203 return ret_val; 204 } 205 206 static int write_register(void *vaddr, u64 val, unsigned int bit_width) 207 { 208 int ret_val = 0; 209 210 switch (bit_width) { 211 case 8: 212 writeb(val, vaddr); 213 break; 214 case 16: 215 writew(val, vaddr); 216 break; 217 case 32: 218 writel(val, vaddr); 219 break; 220 case 64:
221 writeq(val, vaddr);
222 break; 223 default: 224 pr_debug("Error: Cannot write register of %u bit width",
--- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
On 15 January 2016 at 14:33, kbuild test robot lkp@intel.com wrote:
Hi Prashanth,
[auto build test ERROR on pm/linux-next] [also build test ERROR on v4.4 next-20160115] [if your patch is applied to the wrong git tree, please drop us a note to help improving the system]
url: https://github.com/0day-ci/linux/commits/Prashanth-Prakash/acpi-cppc-optimiz... base: https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next config: i386-allmodconfig (attached as .config) reproduce: # save the attached .config to linux build tree make ARCH=i386
All errors (new ones prefixed by >>):
drivers/mailbox/pcc.c: In function 'read_register':
drivers/mailbox/pcc.c:195:10: error: implicit declaration of function 'readq' [-Werror=implicit-function-declaration]
*val = readq(vaddr); ^
drivers/mailbox/pcc.c: In function 'write_register':
drivers/mailbox/pcc.c:221:3: error: implicit declaration of function 'writeq' [-Werror=implicit-function-declaration]
writeq(val, vaddr); ^
cc1: some warnings being treated as errors
You probably need to include asm-generic/io.h
Thanks, Ashwin.
vim +/readq +195 drivers/mailbox/pcc.c
189 *val = readw(vaddr); 190 break; 191 case 32: 192 *val = readl(vaddr); 193 break; 194 case 64:
195 *val = readq(vaddr);
196 break; 197 default: 198 pr_debug("Error: Cannot read register of %u bit width", 199 bit_width); 200 ret_val = -EFAULT; 201 break; 202 } 203 return ret_val; 204 } 205 206 static int write_register(void *vaddr, u64 val, unsigned int bit_width) 207 { 208 int ret_val = 0; 209 210 switch (bit_width) { 211 case 8: 212 writeb(val, vaddr); 213 break; 214 case 16: 215 writew(val, vaddr); 216 break; 217 case 32: 218 writel(val, vaddr); 219 break; 220 case 64:
221 writeq(val, vaddr);
222 break; 223 default: 224 pr_debug("Error: Cannot write register of %u bit width",
0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
On 1/20/2016 3:29 PM, Ashwin Chaugule wrote:
drivers/mailbox/pcc.c:221:3: error: implicit declaration of function 'writeq' [-Werror=implicit-function-declaration]
writeq(val, vaddr); ^
cc1: some warnings being treated as errors
You probably need to include asm-generic/io.h
Thanks! I will update, test and re-post the patch set.
-Prashanth
We do not have a strict read/write order requirement while accessing PCC subspace. The only requirement is all access should be committed before triggering the PCC doorbell to transfer the ownership of PCC to the platform and this requirement is enforced by the PCC driver.
Profiling on a many core system shows improvement of about 1.8us on average per freq change request(about 10% improvement on average). Since these operations are executed while holding the pcc_lock, reducing this time helps the CPPC implementation to scale much better as the number of cores increases.
Signed-off-by: Prashanth Prakash pprakash@codeaurora.org --- drivers/acpi/cppc_acpi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index b85759d..87936d7 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -119,10 +119,10 @@ static int send_pcc_cmd(u16 cmd) }
/* Write to the shared comm region. */ - writew(cmd, &generic_comm_base->command); + writew_relaxed(cmd, &generic_comm_base->command);
/* Flip CMD COMPLETE bit */ - writew(0, &generic_comm_base->status); + writew_relaxed(0, &generic_comm_base->status);
/* Ring doorbell */ ret = mbox_send_message(pcc_channel, &cmd); @@ -604,16 +604,16 @@ static int cpc_read(struct cpc_reg *reg, u64 *val)
switch (reg->bit_width) { case 8: - *val = readb(vaddr); + *val = readb_relaxed(vaddr); break; case 16: - *val = readw(vaddr); + *val = readw_relaxed(vaddr); break; case 32: - *val = readl(vaddr); + *val = readl_relaxed(vaddr); break; case 64: - *val = readq(vaddr); + *val = readq_relaxed(vaddr); break; default: pr_debug("Error: Cannot read %u bit width from PCC\n", @@ -635,16 +635,16 @@ static int cpc_write(struct cpc_reg *reg, u64 val)
switch (reg->bit_width) { case 8: - writeb(val, vaddr); + writeb_relaxed(val, vaddr); break; case 16: - writew(val, vaddr); + writew_relaxed(val, vaddr); break; case 32: - writel(val, vaddr); + writel_relaxed(val, vaddr); break; case 64: - writeq(val, vaddr); + writeq_relaxed(val, vaddr); break; default: pr_debug("Error: Cannot write %u bit width to PCC\n",
On 15 January 2016 at 13:43, Prashanth Prakash pprakash@codeaurora.org wrote:
We do not have a strict read/write order requirement while accessing PCC subspace. The only requirement is all access should be committed before triggering the PCC doorbell to transfer the ownership of PCC to the platform and this requirement is enforced by the PCC driver.
Profiling on a many core system shows improvement of about 1.8us on average per freq change request(about 10% improvement on average). Since these operations are executed while holding the pcc_lock, reducing this time helps the CPPC implementation to scale much better as the number of cores increases.
Signed-off-by: Prashanth Prakash pprakash@codeaurora.org
drivers/acpi/cppc_acpi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-)
Acked-by: Ashwin Chaugule ashwin.chaugule@linaro.org
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index b85759d..87936d7 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -119,10 +119,10 @@ static int send_pcc_cmd(u16 cmd) }
/* Write to the shared comm region. */
writew(cmd, &generic_comm_base->command);
writew_relaxed(cmd, &generic_comm_base->command); /* Flip CMD COMPLETE bit */
writew(0, &generic_comm_base->status);
writew_relaxed(0, &generic_comm_base->status); /* Ring doorbell */ ret = mbox_send_message(pcc_channel, &cmd);
@@ -604,16 +604,16 @@ static int cpc_read(struct cpc_reg *reg, u64 *val)
switch (reg->bit_width) { case 8:
*val = readb(vaddr);
*val = readb_relaxed(vaddr); break; case 16:
*val = readw(vaddr);
*val = readw_relaxed(vaddr); break; case 32:
*val = readl(vaddr);
*val = readl_relaxed(vaddr); break; case 64:
*val = readq(vaddr);
*val = readq_relaxed(vaddr); break; default: pr_debug("Error: Cannot read %u bit width from PCC\n",
@@ -635,16 +635,16 @@ static int cpc_write(struct cpc_reg *reg, u64 val)
switch (reg->bit_width) { case 8:
writeb(val, vaddr);
writeb_relaxed(val, vaddr); break; case 16:
writew(val, vaddr);
writew_relaxed(val, vaddr); break; case 32:
writel(val, vaddr);
writel_relaxed(val, vaddr); break; case 64:
writeq(val, vaddr);
writeq_relaxed(val, vaddr); break; default: pr_debug("Error: Cannot write %u bit width to PCC\n",
-- Qualcomm Technologies, Inc. on behalf of the Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.