The AUX bounce buffer is allocated with the dma_alloc_coherent() API. In the low-level architecture code, e.g. for Arm64, this maps the memory with the attribute "Normal non-cacheable"; this can be concluded from the definition of pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
Later, when the AUX bounce buffer is accessed, efficiency is low because the memory mapping is non-cacheable and every load instruction must reach out to DRAM.
This patch changes the driver to allocate pages with alloc_pages_node(), so the driver can access the memory through the cacheable mapping in the kernel linear virtual address space; because load instructions can then fetch data from cache lines rather than always reading from DRAM, the driver's memory copying performance improves. With the cacheable mapping in use, the driver calls dma_sync_single_for_cpu() to invalidate the cache lines before reading the bounce buffer, so that it avoids reading stale trace data.
Measuring the duration of the function tmc_update_etr_buffer() with the ftrace function_graph tracer shows a significant performance improvement for copying 4MiB of data from the bounce buffer:
# echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise # echo tmc_update_etr_buffer > set_graph_function # echo function_graph > current_tracer
before:
# CPU DURATION FUNCTION CALLS # | | | | | | | 2) | tmc_update_etr_buffer() { ... 2) # 8148.320 us | }
after:
# CPU DURATION FUNCTION CALLS # | | | | | | | 2) | tmc_update_etr_buffer() { ... 2) # 2463.980 us | }
Signed-off-by: Leo Yan leo.yan@linaro.org ---
Changes from v1: Set "flat_buf->daddr" to 0 when mapping the DMA region fails; and dropped the unexpected change to the if condition in tmc_etr_free_flat_buf().
.../hwtracing/coresight/coresight-tmc-etr.c | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index acdb59e0e661..888b0f929d33 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -21,6 +21,7 @@
struct etr_flat_buf { struct device *dev; + struct page *pages; dma_addr_t daddr; void *vaddr; size_t size; @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, { struct etr_flat_buf *flat_buf; struct device *real_dev = drvdata->csdev->dev.parent; + ssize_t aligned_size;
/* We cannot reuse existing pages for flat buf */ if (pages) @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, if (!flat_buf) return -ENOMEM;
- flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size, - &flat_buf->daddr, GFP_KERNEL); - if (!flat_buf->vaddr) { - kfree(flat_buf); - return -ENOMEM; + aligned_size = PAGE_ALIGN(etr_buf->size); + flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, + get_order(aligned_size)); + if (!flat_buf->pages) + goto fail_alloc_pages; + + flat_buf->vaddr = page_address(flat_buf->pages); + flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0, + aligned_size, DMA_FROM_DEVICE); + if (dma_mapping_error(real_dev, flat_buf->daddr)) { + flat_buf->daddr = 0; + goto fail_dma_map_page; }
flat_buf->size = etr_buf->size; @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, etr_buf->mode = ETR_MODE_FLAT; etr_buf->private = flat_buf; return 0; + +fail_dma_map_page: + __free_pages(flat_buf->pages, get_order(aligned_size)); +fail_alloc_pages: + kfree(flat_buf); + return -ENOMEM; }
static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf) @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
if (flat_buf && flat_buf->daddr) { struct device *real_dev = flat_buf->dev->parent; + ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
- dma_free_coherent(real_dev, flat_buf->size, - flat_buf->vaddr, flat_buf->daddr); + dma_unmap_page(real_dev, flat_buf->daddr, aligned_size, + DMA_FROM_DEVICE); + __free_pages(flat_buf->pages, get_order(aligned_size)); } kfree(flat_buf); }
static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) { + struct etr_flat_buf *flat_buf = etr_buf->private; + struct device *real_dev = flat_buf->dev->parent; + /* * Adjust the buffer to point to the beginning of the trace data * and update the available trace data. @@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) etr_buf->len = etr_buf->size; else etr_buf->len = rwp - rrp; + + if (etr_buf->offset + etr_buf->len > etr_buf->size) { + int len1, len2; + + /* + * If trace data is wrapped around, sync AUX bounce buffer + * for two chunks: "len1" is for the trace date length at + * the tail of bounce buffer, and "len2" is the length from + * the start of the buffer after wrapping around. + */ + len1 = etr_buf->size - etr_buf->offset; + len2 = etr_buf->len - len1; + dma_sync_single_for_cpu(real_dev, + flat_buf->daddr + etr_buf->offset, + len1, DMA_FROM_DEVICE); + dma_sync_single_for_cpu(real_dev, flat_buf->daddr, + len2, DMA_FROM_DEVICE); + } else { + dma_sync_single_for_cpu(real_dev, + flat_buf->daddr + etr_buf->offset, + etr_buf->len, DMA_FROM_DEVICE); + } }
static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
Leo,
On 10/07/2021 08:01, Leo Yan wrote:
The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the low level's architecture code, e.g. for Arm64, it maps the memory with the attribution "Normal non-cacheable"; this can be concluded from the definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
Later when access the AUX bounce buffer, since the memory mapping is non-cacheable, it's low efficiency due to every load instruction must reach out DRAM.
This patch changes to allocate pages with alloc_pages_node(), thus the driver can access the memory with cacheable mapping in the kernel linear virtual address; therefore, because load instructions can fetch data from cache lines rather than always read data from DRAM, the driver can boost memory coping performance. After using the cacheable mapping, the driver uses dma_sync_single_for_cpu() to invalidate cacheline prior to read bounce buffer so can avoid read stale trace data.
By measurement the duration for function tmc_update_etr_buffer() with ftrace function_graph tracer, it shows the performance significant improvement for copying 4MiB data from bounce buffer:
# echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise # echo tmc_update_etr_buffer > set_graph_function # echo function_graph > current_tracer
before:
# CPU DURATION FUNCTION CALLS # | | | | | | | 2) | tmc_update_etr_buffer() { ... 2) # 8148.320 us | }
after:
# CPU DURATION FUNCTION CALLS # | | | | | | | 2) | tmc_update_etr_buffer() { ... 2) # 2463.980 us | }
That's a good speed-up! Thanks for the patch. One minor comment below.
Signed-off-by: Leo Yan leo.yan@linaro.org
Changes from v1: Set "flat_buf->daddr" to 0 when fails to map DMA region; and dropped the unexpected if condition change in tmc_etr_free_flat_buf().
.../hwtracing/coresight/coresight-tmc-etr.c | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index acdb59e0e661..888b0f929d33 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -21,6 +21,7 @@ struct etr_flat_buf { struct device *dev;
- struct page *pages; dma_addr_t daddr; void *vaddr; size_t size;
@@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, { struct etr_flat_buf *flat_buf; struct device *real_dev = drvdata->csdev->dev.parent;
- ssize_t aligned_size;
/* We cannot reuse existing pages for flat buf */ if (pages) @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, if (!flat_buf) return -ENOMEM;
- flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
&flat_buf->daddr, GFP_KERNEL);
- if (!flat_buf->vaddr) {
kfree(flat_buf);
return -ENOMEM;
- aligned_size = PAGE_ALIGN(etr_buf->size);
- flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
get_order(aligned_size));
- if (!flat_buf->pages)
goto fail_alloc_pages;
- flat_buf->vaddr = page_address(flat_buf->pages);
- flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
aligned_size, DMA_FROM_DEVICE);
- if (dma_mapping_error(real_dev, flat_buf->daddr)) {
flat_buf->daddr = 0;
}goto fail_dma_map_page;
flat_buf->size = etr_buf->size; @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, etr_buf->mode = ETR_MODE_FLAT; etr_buf->private = flat_buf; return 0;
+fail_dma_map_page:
- __free_pages(flat_buf->pages, get_order(aligned_size));
+fail_alloc_pages:
- kfree(flat_buf);
- return -ENOMEM; }
static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf) @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf) if (flat_buf && flat_buf->daddr) { struct device *real_dev = flat_buf->dev->parent;
ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
dma_free_coherent(real_dev, flat_buf->size,
flat_buf->vaddr, flat_buf->daddr);
dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
DMA_FROM_DEVICE);
} kfree(flat_buf); }__free_pages(flat_buf->pages, get_order(aligned_size));
static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) {
- struct etr_flat_buf *flat_buf = etr_buf->private;
- struct device *real_dev = flat_buf->dev->parent;
- /*
- Adjust the buffer to point to the beginning of the trace data
- and update the available trace data.
@@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) etr_buf->len = etr_buf->size; else etr_buf->len = rwp - rrp;
- if (etr_buf->offset + etr_buf->len > etr_buf->size) {
int len1, len2;
/*
* If trace data is wrapped around, sync AUX bounce buffer
* for two chunks: "len1" is for the trace date length at
* the tail of bounce buffer, and "len2" is the length from
* the start of the buffer after wrapping around.
*/
len1 = etr_buf->size - etr_buf->offset;
len2 = etr_buf->len - len1;
dma_sync_single_for_cpu(real_dev,
flat_buf->daddr + etr_buf->offset,
len1, DMA_FROM_DEVICE);
dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
len2, DMA_FROM_DEVICE);
We always start tracing at the beginning of the buffer, and the only reason why we would get a wrap-around is when the buffer is full. So you could just as well sync the entire buffer in one go:
dma_sync_single_for_cpu(real_dev, flat_buf->daddr, etr_buf->len, DMA_FROM_DEVICE);
- } else {
dma_sync_single_for_cpu(real_dev,
flat_buf->daddr + etr_buf->offset,
etr_buf->len, DMA_FROM_DEVICE);
- } }
static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
Either way,
Reviewed-by: Suzuki K Poulose suzuki.poulose@arm.com
Hi Suzuki,
On Mon, Jul 12, 2021 at 10:55:32AM +0100, Suzuki Kuruppassery Poulose wrote:
[...]
static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) {
- struct etr_flat_buf *flat_buf = etr_buf->private;
- struct device *real_dev = flat_buf->dev->parent;
- /*
- Adjust the buffer to point to the beginning of the trace data
- and update the available trace data.
@@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) etr_buf->len = etr_buf->size; else etr_buf->len = rwp - rrp;
- if (etr_buf->offset + etr_buf->len > etr_buf->size) {
int len1, len2;
/*
* If trace data is wrapped around, sync AUX bounce buffer
* for two chunks: "len1" is for the trace date length at
* the tail of bounce buffer, and "len2" is the length from
* the start of the buffer after wrapping around.
*/
len1 = etr_buf->size - etr_buf->offset;
len2 = etr_buf->len - len1;
dma_sync_single_for_cpu(real_dev,
flat_buf->daddr + etr_buf->offset,
len1, DMA_FROM_DEVICE);
dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
len2, DMA_FROM_DEVICE);
We always start tracing at the beginning of the buffer and the only reason why we would get a wrap around, is when the buffer is full. So you could as well sync the entire buffer in one go
dma_sync_single_for_cpu(real_dev, flat_buf->daddr, etr_buf->len, DMA_FROM_DEVICE);
I am not sure why you conclude that we "always start tracing at the beginning of the buffer". I read the driver but could not find any code that resets rrp and rwp after fetching the trace data; or is there an implicit operation that resets the pointers?
I just want to double-check this, in case I missed anything. Thanks for the review.
Leo
On 12/07/2021 12:09, Leo Yan wrote:
Hi Suzuki,
On Mon, Jul 12, 2021 at 10:55:32AM +0100, Suzuki Kuruppassery Poulose wrote:
[...]
static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) {
- struct etr_flat_buf *flat_buf = etr_buf->private;
- struct device *real_dev = flat_buf->dev->parent;
- /*
- Adjust the buffer to point to the beginning of the trace data
- and update the available trace data.
@@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) etr_buf->len = etr_buf->size; else etr_buf->len = rwp - rrp;
- if (etr_buf->offset + etr_buf->len > etr_buf->size) {
int len1, len2;
/*
* If trace data is wrapped around, sync AUX bounce buffer
* for two chunks: "len1" is for the trace date length at
* the tail of bounce buffer, and "len2" is the length from
* the start of the buffer after wrapping around.
*/
len1 = etr_buf->size - etr_buf->offset;
len2 = etr_buf->len - len1;
dma_sync_single_for_cpu(real_dev,
flat_buf->daddr + etr_buf->offset,
len1, DMA_FROM_DEVICE);
dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
len2, DMA_FROM_DEVICE);
We always start tracing at the beginning of the buffer and the only reason why we would get a wrap around, is when the buffer is full. So you could as well sync the entire buffer in one go
dma_sync_single_for_cpu(real_dev, flat_buf->daddr, etr_buf->len, DMA_FROM_DEVICE);
I am doubt why you conclude "always start tracing at the beginning of the buffer"? I read the driver but cannot find any code in the driver to reset rrp and rwp after fetching the trace data, or there have any implict operation to reset pointers?
The ETR is always programmed with the base address of the "ETR" buffer, which is *not the same* as the perf ring buffer, since we always do double buffering. We do not program the RRP/RWP of the ETR (except for the SoC-600, where it is mandatory and we set them to the base address). Thus there is no context associated with the ETR buffer. But at the end of the run, we do read the RRP/ RWP to figure out where the ETR has reached.
As for resetting the RRP / RWP: at the beginning of a session, this is done implicitly by the hardware for the ETR (except for SoC-600 ETRs, as explained above), which sets them to the base address.
Suzuki
On Mon, Jul 12, 2021 at 12:17:04PM +0100, Suzuki Kuruppassery Poulose wrote:
[...]
I am doubt why you conclude "always start tracing at the beginning of the buffer"? I read the driver but cannot find any code in the driver to reset rrp and rwp after fetching the trace data, or there have any implict operation to reset pointers?
The ETR is always programmed with the base address of the "ETR" buffer, which is *not the same* as the perf ring buffer, since we always do double buffering. We do not program the RRP/RWP of the ETR (except for the SoC-600, where it is mandatory and we set them to the base address). Thus there is no context associated with the ETR buffer. But at the end of the run, we do read the RRP/ RWP to figure out where the ETR has reached.
As for reseting the RRP / RWP, at the beginning of a session, is done implicitly for the ETR (except for SoC-600 ETRs as explained above) by the hardware to the base address.
Yes, I finally matched your description with the code. I will respin the patch for this.
Thanks for confirmation! Leo