The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
low level's architecture code, e.g. for Arm64, it maps the memory with
the attribution "Normal non-cacheable"; this can be concluded from the
definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
Later, when accessing the AUX bounce buffer, since the memory mapping is
non-cacheable, efficiency is low because every load instruction must
reach out to DRAM.
This patch changes to allocate pages with alloc_pages_node(), thus the
driver can access the memory with cacheable mapping in the kernel linear
virtual address; therefore, because load instructions can fetch data
from cache lines rather than always reading data from DRAM, the driver can
boost memory copying performance. After using the cacheable mapping, the
driver uses dma_sync_single_for_cpu() to invalidate cache lines prior to
reading the bounce buffer, so it can avoid reading stale trace data.
Measuring the duration of function tmc_update_etr_buffer() with the
ftrace function_graph tracer shows a significant performance
improvement for copying 4MiB data from the bounce buffer:
# echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
# echo tmc_update_etr_buffer > set_graph_function
# echo function_graph > current_tracer
before:
# CPU DURATION FUNCTION CALLS
# | | | | | | |
2) | tmc_update_etr_buffer() {
...
2) # 8148.320 us | }
after:
# CPU DURATION FUNCTION CALLS
# | | | | | | |
2) | tmc_update_etr_buffer() {
...
2) # 2463.980 us | }
Signed-off-by: Leo Yan <leo.yan(a)linaro.org>
---
Changes from v1:
Set "flat_buf->daddr" to 0 when it fails to map the DMA region; and dropped the
unexpected if condition change in tmc_etr_free_flat_buf().
.../hwtracing/coresight/coresight-tmc-etr.c | 56 ++++++++++++++++---
1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index acdb59e0e661..888b0f929d33 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -21,6 +21,7 @@
struct etr_flat_buf {
struct device *dev;
+ struct page *pages;
dma_addr_t daddr;
void *vaddr;
size_t size;
@@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
{
struct etr_flat_buf *flat_buf;
struct device *real_dev = drvdata->csdev->dev.parent;
+ ssize_t aligned_size;
/* We cannot reuse existing pages for flat buf */
if (pages)
@@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
if (!flat_buf)
return -ENOMEM;
- flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
- &flat_buf->daddr, GFP_KERNEL);
- if (!flat_buf->vaddr) {
- kfree(flat_buf);
- return -ENOMEM;
+ aligned_size = PAGE_ALIGN(etr_buf->size);
+ flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+ get_order(aligned_size));
+ if (!flat_buf->pages)
+ goto fail_alloc_pages;
+
+ flat_buf->vaddr = page_address(flat_buf->pages);
+ flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
+ aligned_size, DMA_FROM_DEVICE);
+ if (dma_mapping_error(real_dev, flat_buf->daddr)) {
+ flat_buf->daddr = 0;
+ goto fail_dma_map_page;
}
flat_buf->size = etr_buf->size;
@@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
etr_buf->mode = ETR_MODE_FLAT;
etr_buf->private = flat_buf;
return 0;
+
+fail_dma_map_page:
+ __free_pages(flat_buf->pages, get_order(aligned_size));
+fail_alloc_pages:
+ kfree(flat_buf);
+ return -ENOMEM;
}
static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
@@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
if (flat_buf && flat_buf->daddr) {
struct device *real_dev = flat_buf->dev->parent;
+ ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
- dma_free_coherent(real_dev, flat_buf->size,
- flat_buf->vaddr, flat_buf->daddr);
+ dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
+ DMA_FROM_DEVICE);
+ __free_pages(flat_buf->pages, get_order(aligned_size));
}
kfree(flat_buf);
}
static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
{
+ struct etr_flat_buf *flat_buf = etr_buf->private;
+ struct device *real_dev = flat_buf->dev->parent;
+
/*
* Adjust the buffer to point to the beginning of the trace data
* and update the available trace data.
@@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
etr_buf->len = etr_buf->size;
else
etr_buf->len = rwp - rrp;
+
+ if (etr_buf->offset + etr_buf->len > etr_buf->size) {
+ int len1, len2;
+
+ /*
+ * If trace data is wrapped around, sync AUX bounce buffer
+ * for two chunks: "len1" is for the trace data length at
+ * the tail of bounce buffer, and "len2" is the length from
+ * the start of the buffer after wrapping around.
+ */
+ len1 = etr_buf->size - etr_buf->offset;
+ len2 = etr_buf->len - len1;
+ dma_sync_single_for_cpu(real_dev,
+ flat_buf->daddr + etr_buf->offset,
+ len1, DMA_FROM_DEVICE);
+ dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
+ len2, DMA_FROM_DEVICE);
+ } else {
+ dma_sync_single_for_cpu(real_dev,
+ flat_buf->daddr + etr_buf->offset,
+ etr_buf->len, DMA_FROM_DEVICE);
+ }
}
static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
--
2.25.1
The current code syncs the buffer range [offset, offset+len); it doesn't
consider the case when the trace data is wrapped around, in which case
'offset+len' is bigger than 'etr_buf->size'. Thus it syncs beyond the end
of the memory buffer, and it also fails to sync the data at the start
of the memory.
This patch corrects the memory sync ranges: when it detects the wrapping
around case, it splits the sync into two chunks: one chunk is the tail of
the buffer and the other chunk is from the start of the buffer after
wrapping around.
Signed-off-by: Leo Yan <leo.yan(a)linaro.org>
---
.../hwtracing/coresight/coresight-tmc-etr.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 888b0f929d33..a1afefcbf175 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -780,7 +780,23 @@ static void tmc_etr_sync_sg_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
else
etr_buf->len = ((w_offset < r_offset) ? etr_buf->size : 0) +
w_offset - r_offset;
- tmc_sg_table_sync_data_range(table, r_offset, etr_buf->len);
+
+ if (r_offset + etr_buf->len > etr_buf->size) {
+ int len1, len2;
+
+ /*
+ * If trace data is wrapped around, sync AUX bounce buffer
+ * for two chunks: "len1" is for the trace data length at
+ * the tail of bounce buffer, and "len2" is the length from
+ * the start of the buffer after wrapping around.
+ */
+ len1 = etr_buf->size - r_offset;
+ len2 = etr_buf->len - len1;
+ tmc_sg_table_sync_data_range(table, r_offset, len1);
+ tmc_sg_table_sync_data_range(table, 0, len2);
+ } else {
+ tmc_sg_table_sync_data_range(table, r_offset, etr_buf->len);
+ }
}
static const struct etr_buf_operations etr_sg_buf_ops = {
--
2.25.1
Hi Adrian,
On Sat, Jul 10, 2021 at 03:36:53PM +0300, Adrian Hunter wrote:
> On 4/07/21 10:16 am, Leo Yan wrote:
> > Since the __sync functions have been dropped, This patch removes unused
> > build and checking for HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT in perf tool.
> >
> > Note, there have a test for SYNC_COMPARE_AND_SWAP and the test file is
> > located in build/feature/test-sync-compare-and-swap.c. Since there
> > still has several components using the sync functions, it's deliberately
> > to not be removed.
>
> I don't quite follow that. If they aren't using the feature test
> macro, then why keep the feature test?
There are files still using the __sync_xxx_compare_and_swap() functions,
e.g. in the folder tools/testing/selftests/bpf. On the other hand,
after dropping the __sync functions from perf, no Makefile checks
the feature 'feature-sync-compare-and-swap' any more. So it's safe to
remove the feature test.
Sorry for the confusion. Will drop the feature test in the new patch set.
Thanks,
Leo
On Sat, Jul 10, 2021 at 03:34:24PM +0300, Adrian Hunter wrote:
> On 4/07/21 10:16 am, Leo Yan wrote:
> > The main purpose for using __sync built-in functions is to support
> > compat mode for 32-bit perf with 64-bit kernel. But using these
> > built-in functions might cause couple potential issues.
> >
> > Firstly, __sync functions originally support Intel Itanium processoer [1]
> > but it cannot promise to support all 32-bit archs. Now these
> > functions have become the legacy functions.
> >
> > As Peter also pointed out the logic issue in the function
> > auxtrace_mmap__write_tail(), it does a cmpxchg with 0 values to load
> > old_tail, and then executes a further cmpxchg with old_tail to write
> > the new tail. If consider the aux_tail might be assigned to '0' in the
> > middle of loops, this can introduce mess for AUX buffer if the kernel
> > fetches the temporary value '0'.
>
> That is not exactly true. The definition of __sync_*_compare_and_swap is
> "if the current value of *ptr is oldval, then write newval into *pt"
> so replacing zero with zero won't make any difference, but it will return
> the old value in any case. Probably better to leave out that paragraph.
Okay, I admit the paragraph is not right, will drop it to avoid
confusion. Thanks for review!
Leo
This patchset represents the second phase of CoreSight configuration
management.
1) API updated to allow dynamic load and unload of configurations and
features. Dependency management between loaded sets is added.
2) New configuration and feature sets can be added using a loadable module.
An example in /samples/coresight is provided to demonstrate this.
3) Resource management API is added. This allows the system to ensure that
loaded configurations and features are only loaded onto devices that can
support them.
Further - it ensures that configurations with multiple features cannot over
allocate resources.
4) configfs can be used to activate a configuration which will then be used
when controlling tracing using sysfs.
5) Resource management is added to ETMv4 configurations. This allows current
and future features and configurations to be defined in terms of resources
used as well as registers to be programmed.
Defining features in this way allows the resource management to operate
correctly.
The perf event parsing is also adjusted to allow the ETM resources requested
on the command line (e.g. address filters, etc) to be correctly handled
using resource management alongside the complex configurations such as
autofdo.
Applies to coresight/next - which is 5.13-rc1 + initial Coresight configuration
patchset.
To follow in future revisions / sets:-
a) load of additional config and features by configfs
b) ECT and CTI and other Coresight components support for configuration and
features.
Mike Leach (8):
coresight: syscfg: Update API to allow dynamic load and unload
coresight: syscfg: Update load API for config loadable modules
coresight: syscfg: Example CoreSight configuration loadable module
coresight: configfs: Allow configfs to activate configuration.
coresight: syscfg: Add API to check and validate device resources.
coresight: etm4x: syscfg: Add resource management to etm4x.
coresight: etm4x: Update perf event resource handling.
coresight: etm4x: Update configuration example.
MAINTAINERS | 1 +
.../hwtracing/coresight/coresight-cfg-afdo.c | 38 +-
.../coresight/coresight-cfg-preload.c | 9 +-
.../hwtracing/coresight/coresight-config.c | 71 ++-
.../hwtracing/coresight/coresight-config.h | 45 +-
.../hwtracing/coresight/coresight-etm4x-cfg.c | 533 ++++++++++++++++++
.../hwtracing/coresight/coresight-etm4x-cfg.h | 196 ++++++-
.../coresight/coresight-etm4x-core.c | 250 +++-----
.../coresight/coresight-syscfg-configfs.c | 87 +++
.../coresight/coresight-syscfg-configfs.h | 4 +
.../hwtracing/coresight/coresight-syscfg.c | 390 +++++++++++--
.../hwtracing/coresight/coresight-syscfg.h | 38 +-
include/linux/coresight.h | 2 +
samples/Kconfig | 9 +
samples/Makefile | 1 +
samples/coresight/Makefile | 4 +
samples/coresight/coresight-cfg-sample.c | 73 +++
17 files changed, 1511 insertions(+), 240 deletions(-)
create mode 100644 samples/coresight/Makefile
create mode 100644 samples/coresight/coresight-cfg-sample.c
--
2.17.1
This patch series is to refine the memory barriers for AUX ring buffer.
Patches 01 ~ 04 address the barrier usage in the kernel. The first
patch clarifies the comment on how to use the barriers between the
data store and the aux_head store; this asks the driver to make sure the
data is visible. Patches 02 ~ 04 refine the drivers' barriers
after the data store.
Patches 05 ~ 07 drop the legacy __sync functions, remove
duplicate code, and clean up the build now that SYNC_COMPARE_AND_SWAP is
no longer used.
Patch 08 uses WRITE_ONCE() for updating aux_tail.
Since the atomicity of 64-bit values is not guaranteed on 32-bit perf, the
last two patches try to fix up the perf tool when it runs in compat
mode. Patch 09 introduces a new global variable to indicate that the kernel
runs in 64-bit mode, which can be used to confirm whether perf is in compat
mode; patch 10 introduces variant functions for accessing the AUX head/tail:
they resolve the atomicity issue for reading the head pointer, and for the
tail write overflow issue they return an error to notify the tool to exit.
Have tested the patches on the Arm64 Juno platform.
Changes from v2:
- Removed auxtrace_mmap__read_snapshot_head(), which has the duplicated
code with auxtrace_mmap__read_head();
- Cleaned up the build for HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT (Adrian);
- Added global variable "kernel_is_64_bit" (Adrian);
- Added compat variants compat_auxtrace_mmap__{read_head|write_tail}
(Adrian).
Leo Yan (10):
perf/ring_buffer: Add comment for barriers on AUX ring buffer
coresight: tmc-etr: Add barrier after updating AUX ring buffer
coresight: tmc-etf: Add comment for store ordering
perf/x86: Add barrier after updating bts
perf auxtrace: Drop legacy __sync functions
perf auxtrace: Remove auxtrace_mmap__read_snapshot_head()
perf: Cleanup for HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT
perf auxtrace: Use WRITE_ONCE() for updating aux_tail
perf env: Set kernel bit mode
perf auxtrace: Add compat_auxtrace_mmap__{read_head|write_tail}
arch/x86/events/intel/bts.c | 3 +
.../hwtracing/coresight/coresight-tmc-etf.c | 6 +
.../hwtracing/coresight/coresight-tmc-etr.c | 8 ++
kernel/events/ring_buffer.c | 9 ++
tools/perf/Makefile.config | 4 -
tools/perf/util/auxtrace.c | 19 ++-
tools/perf/util/auxtrace.h | 109 ++++++++++++++----
tools/perf/util/env.c | 17 ++-
tools/perf/util/env.h | 1 +
9 files changed, 136 insertions(+), 40 deletions(-)
--
2.25.1
This patch series is to correct the pointer usages for the snapshot
mode.
Patch 01 is to polish code, it removes the redundant header maintained
in tmc-etr driver and directly uses pointer perf_output_handle::head.
Patch 02 removes the callback cs_etm_find_snapshot() which wrongly
calculates the buffer headers; we can simply use the perf's common
function __auxtrace_mmap__read() for headers calculation. Patch 03 is
to update comments in CoreSight drivers to reflect the changes
introduced by patch 02.
This patch can be cleanly applied on the mainline kernel with:
commit dbe69e433722 ("Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next")
And it has been tested on Arm64 Juno board.
Changes from v1:
- Dropped the patch "coresight: etm-perf: Correct buffer syncing for
snapshot"; after a long discussion, the patch doesn't really resolve
any issues for snapshot mode. Another reason for dropping this
patch is that CoreSight and Intel-PT now have consistent behaviour
(Suzuki/James/Mathieu);
- Added patch 03 to update drivers' comments (James);
- Added Suzuki's review tag for patch 01;
- Added James' review and testing tags for patch 02.
Leo Yan (3):
coresight: tmc-etr: Use perf_output_handle::head for AUX ring buffer
perf cs-etm: Remove callback cs_etm_find_snapshot()
coresight: Update comments for removing cs_etm_find_snapshot()
drivers/hwtracing/coresight/coresight-etb10.c | 2 +-
.../hwtracing/coresight/coresight-tmc-etf.c | 2 +-
.../hwtracing/coresight/coresight-tmc-etr.c | 12 +-
tools/perf/arch/arm/util/cs-etm.c | 133 ------------------
4 files changed, 6 insertions(+), 143 deletions(-)
--
2.25.1
Currently, timeless mode starts the decode on PERF_RECORD_EXIT, and
non-timeless mode starts decoding on the first PERF_RECORD_AUX record.
This can cause the "data has no samples!" error if the first
PERF_RECORD_AUX record comes before the first (or any relevant)
PERF_RECORD_MMAP2 record because the mmaps are required by the decoder
to access the binary data.
This change pushes the start of non-timeless decoding to the very end of
parsing the file. The PERF_RECORD_EXIT event can't be used because it
might not exist in system-wide or snapshot modes.
I have not been able to find the exact cause for the events to be
intermittently in the wrong order in the basic scenario:
perf record -e cs_etm/@tmc_etr0/u top
But it can be made to happen every time with the --delay option. This is
because "enable_on_exec" is disabled, which causes tracing to start
before the process to be launched is exec'd. For example:
perf record -e cs_etm/@tmc_etr0/u --delay=1 top
perf report -D | grep 'AUX\|MAP'
0 16714475632740 0x520 [0x40]: PERF_RECORD_AUX offset: 0 size: 0x30 flags: 0 []
0 16714476494960 0x5d0 [0x40]: PERF_RECORD_AUX offset: 0x30 size: 0x30 flags: 0 []
0 16714478208900 0x660 [0x40]: PERF_RECORD_AUX offset: 0x60 size: 0x30 flags: 0 []
4294967295 16714478293340 0x700 [0x70]: PERF_RECORD_MMAP2 8712/8712: [0x557a460000(0x54000) @ 0 00:17 5329258 0]: r-xp /usr/bin/top
4294967295 16714478353020 0x770 [0x88]: PERF_RECORD_MMAP2 8712/8712: [0x7f86f72000(0x34000) @ 0 00:17 5214354 0]: r-xp /usr/lib/aarch64-linux-gnu/ld-2.31.so
Another scenario in which decoding from the first aux record fails is a
workload that forks. Although the aux record comes after 'bash', it
comes before 'top', which is what we are interested in. For example:
perf record -e cs_etm/@tmc_etr0/u -- bash -c top
perf report -D | grep 'AUX\|MAP'
4294967295 16853946421300 0x510 [0x70]: PERF_RECORD_MMAP2 8723/8723: [0x558f280000(0x142000) @ 0 00:17 5213953 0]: r-xp /usr/bin/bash
4294967295 16853946543560 0x580 [0x88]: PERF_RECORD_MMAP2 8723/8723: [0x7fbba6e000(0x34000) @ 0 00:17 5214354 0]: r-xp /usr/lib/aarch64-linux-gnu/ld-2.31.so
4294967295 16853946628420 0x608 [0x68]: PERF_RECORD_MMAP2 8723/8723: [0x7fbba9e000(0x1000) @ 0 00:00 0 0]: r-xp [vdso]
0 16853947067300 0x690 [0x40]: PERF_RECORD_AUX offset: 0 size: 0x3a60 flags: 0 []
...
0 16853966602580 0x1758 [0x40]: PERF_RECORD_AUX offset: 0xc2470 size: 0x30 flags: 0 []
4294967295 16853967119860 0x1818 [0x70]: PERF_RECORD_MMAP2 8723/8723: [0x5559e70000(0x54000) @ 0 00:17 5329258 0]: r-xp /usr/bin/top
4294967295 16853967181620 0x1888 [0x88]: PERF_RECORD_MMAP2 8723/8723: [0x7f9ed06000(0x34000) @ 0 00:17 5214354 0]: r-xp /usr/lib/aarch64-linux-gnu/ld-2.31.so
4294967295 16853967237180 0x1910 [0x68]: PERF_RECORD_MMAP2 8723/8723: [0x7f9ed36000(0x1000) @ 0 00:00 0 0]: r-xp [vdso]
A third scenario is when the majority of time is spent in a shared
library that is not loaded at startup. For example a dynamically loaded
plugin.
Testing
=======
Testing was done by checking if any samples that are present in the
old output are missing from the new output. Timestamps must be
stripped out with awk because now they are set to the last AUX sample,
rather than the first:
./perf script $4 | awk '!($4="")' > new.script
./perf-default script $4 | awk '!($4="")' > default.script
comm -13 <(sort -u new.script) <(sort -u default.script)
Testing showed that the new output is a superset of the old. When lines
appear in the comm output, it is not because they are missing but
because [unknown] is now resolved to sensible locations. For example
last putp branch here now resolves to libtinfo, so it's not missing
from the output, but is actually improved:
Old:
top 305 [001] 1 branches:uH: 402830 _init+0x30 (/usr/bin/top.procps) => 404a1c [unknown] (/usr/bin/top.procps)
top 305 [001] 1 branches:uH: 404a20 [unknown] (/usr/bin/top.procps) => 402970 putp@plt+0x0 (/usr/bin/top.procps)
top 305 [001] 1 branches:uH: 40297c putp@plt+0xc (/usr/bin/top.procps) => 0 [unknown] ([unknown])
New:
top 305 [001] 1 branches:uH: 402830 _init+0x30 (/usr/bin/top.procps) => 404a1c [unknown] (/usr/bin/top.procps)
top 305 [001] 1 branches:uH: 404a20 [unknown] (/usr/bin/top.procps) => 402970 putp@plt+0x0 (/usr/bin/top.procps)
top 305 [001] 1 branches:uH: 40297c putp@plt+0xc (/usr/bin/top.procps) => 7f8ab39208 putp+0x0 (/lib/libtinfo.so.5.9)
In the following two modes, decoding now works and the "data has no
samples!" error is not displayed any more:
perf record -e cs_etm/@tmc_etr0/u -- bash -c top
perf record -e cs_etm/@tmc_etr0/u --delay=1 top
In snapshot mode, there is also an improvement to decoding. Previously
samples for the 'kill' process that was used to send SIGUSR2 were
completely missing, because the process hadn't started yet. But now
there are additional samples present:
perf record -e cs_etm/@tmc_etr0/u --snapshot -a
perf script
stress 19380 [003] 161627.938153: 1000000 instructions:uH: aaaabb612fb4 [unknown] (/usr/bin/stress)
kill 19644 [000] 161627.938153: 1000000 instructions:uH: ffffae0ef210 [unknown] (/lib/aarch64-linux-gnu/ld-2.27.so)
stress 19380 [003] 161627.938153: 1000000 instructions:uH: ffff9e754d40 random_r+0x20 (/lib/aarch64-linux-gnu/libc-2.27.so)
Also tested was the round trip of 'perf inject' followed by 'perf
report' which has the same differences and improvements.
Signed-off-by: James Clark <james.clark(a)arm.com>
---
tools/perf/util/cs-etm.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 57aea2c7fc77..ceed0b038796 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -2407,6 +2407,11 @@ static int cs_etm__process_event(struct perf_session *session,
return err;
}
+ /*
+ * Don't wait for cs_etm__flush_events() in per-thread/timeless mode to start the decode. We
+ * need the tid of the PERF_RECORD_EXIT event to assign to the synthesised samples because
+ * ETM_OPT_CTXTID is not enabled.
+ */
if (etm->timeless_decoding &&
event->header.type == PERF_RECORD_EXIT)
return cs_etm__process_timeless_queues(etm,
@@ -2424,7 +2429,6 @@ static int cs_etm__process_event(struct perf_session *session,
* onwards.
*/
etm->latest_kernel_timestamp = sample_kernel_timestamp;
- return cs_etm__process_queues(etm);
}
return 0;
--
2.28.0