 
            6.6-stable review patch. If anyone has any objections, please let me know.
------------------
From: Robin Murphy robin.murphy@arm.com
[ Upstream commit 7633ec2c262fab3e7c5bf3cd3876b5748f584a57 ]
The bitmap-based scheme for tracking DTC counter usage turns out to be a complete dead-end for its imagined purpose, since by the time we have to keep track of a per-DTC counter index anyway, we already have enough information to make the bitmap itself redundant. Revert the remains of it back to almost the original scheme, but now expanded to track per-DTC indices, in preparation for making use of them in anger.
Note that since cycle count events always use a dedicated counter on a single DTC, we reuse the field to encode their DTC index directly.
Signed-off-by: Robin Murphy robin.murphy@arm.com Reviewed-by: Ilkka Koskinen ilkka@os.amperecomputing.com Link: https://lore.kernel.org/r/5f6ade76b47f033836d7a36c03555da896dfb4a3.169782421... Signed-off-by: Will Deacon will@kernel.org Stable-dep-of: e79634b53e39 ("perf/arm-cmn: Refactor node ID handling. Again.") Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 62 deletions(-)
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 2c684e49a6fc7..1c9e42aeb7004 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -281,16 +281,13 @@ struct arm_cmn_node { u16 id, logid; enum cmn_node_type type;
- int dtm; - union { - /* DN/HN-F/CXHA */ - struct { - u8 val : 4; - u8 count : 4; - } occupid[SEL_MAX]; - /* XP */ - u8 dtc; - }; + u8 dtm; + s8 dtc; + /* DN/HN-F/CXHA */ + struct { + u8 val : 4; + u8 count : 4; + } occupid[SEL_MAX]; union { u8 event[4]; __le32 event_sel; @@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data)
seq_puts(s, "\n |"); for (x = 0; x < cmn->mesh_x; x++) { - u8 dtc = cmn->xps[xp_base + x].dtc; + s8 dtc = cmn->xps[xp_base + x].dtc;
- if (dtc & (dtc - 1)) + if (dtc < 0) seq_puts(s, " DTC ?? |"); else - seq_printf(s, " DTC %ld |", __ffs(dtc)); + seq_printf(s, " DTC %d |", dtc); } seq_puts(s, "\n |"); for (x = 0; x < cmn->mesh_x; x++) @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} struct arm_cmn_hw_event { struct arm_cmn_node *dn; u64 dtm_idx[4]; - unsigned int dtc_idx; - u8 dtcs_used; + s8 dtc_idx[CMN_MAX_DTCS]; u8 num_dns; u8 dtm_offset; bool wide_sel; @@ -600,6 +596,10 @@ struct arm_cmn_hw_event { #define for_each_hw_dn(hw, dn, i) \ for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++)
+/* @i is the DTC number, @idx is the counter index on that DTC */ +#define for_each_hw_dtc_idx(hw, i, idx) \ + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) + static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event) { BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target)); @@ -1429,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); - unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx); u64 count;
- for (i = 0; hw->dtcs_used & (1U << i); i++) { - writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt); - cmn->dtc[i].counters[hw->dtc_idx] = event; + for_each_hw_dtc_idx(hw, i, idx) { + writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx)); + cmn->dtc[i].counters[idx] = event; }
count = arm_cmn_read_dtm(cmn, hw, false); @@ -1447,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event) struct arm_cmn_hw_event *hw = to_cmn_hw(event); u64 delta, new, prev; unsigned long flags; - unsigned int i;
- if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) { - i = __ffs(hw->dtcs_used); - delta = arm_cmn_read_cc(cmn->dtc + i); + if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) { + delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]); local64_add(delta, &event->count); return; } @@ -1461,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event) delta = new - prev;
local_irq_save(flags); - for (i = 0; hw->dtcs_used & (1U << i); i++) { - new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx); + for_each_hw_dtc_idx(hw, i, idx) { + new = arm_cmn_read_counter(cmn->dtc + i, idx); delta += new << 16; } local_irq_restore(flags); @@ -1518,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) int i;
if (type == CMN_TYPE_DTC) { - i = __ffs(hw->dtcs_used); + i = hw->dtc_idx[0]; writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR); cmn->dtc[i].cc_active = true; } else if (type == CMN_TYPE_WP) { @@ -1549,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int i;
if (type == CMN_TYPE_DTC) { - i = __ffs(hw->dtcs_used); + i = hw->dtc_idx[0]; cmn->dtc[i].cc_active = false; } else if (type == CMN_TYPE_WP) { int wp_idx = arm_cmn_wp_idx(event); @@ -1735,12 +1732,19 @@ static int arm_cmn_event_init(struct perf_event *event) hw->dn = arm_cmn_node(cmn, type); if (!hw->dn) return -EINVAL; + + memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx)); for (dn = hw->dn; dn->type == type; dn++) { if (bynodeid && dn->id != nodeid) { hw->dn++; continue; } hw->num_dns++; + if (dn->dtc < 0) + memset(hw->dtc_idx, 0, cmn->num_dtcs); + else + hw->dtc_idx[dn->dtc] = 0; + if (bynodeid) break; } @@ -1752,12 +1756,6 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } - /* - * Keep assuming non-cycles events count in all DTC domains; turns out - * it's hard to make a worthwhile optimisation around this, short of - * going all-in with domain-local counter allocation as well. - */ - hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
return arm_cmn_validate_group(cmn, event); } @@ -1783,28 +1781,25 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, } memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
- for (i = 0; hw->dtcs_used & (1U << i); i++) - cmn->dtc[i].counters[hw->dtc_idx] = NULL; + for_each_hw_dtc_idx(hw, j, idx) + cmn->dtc[j].counters[idx] = NULL; }
static int arm_cmn_event_add(struct perf_event *event, int flags) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); - struct arm_cmn_dtc *dtc = &cmn->dtc[0]; struct arm_cmn_node *dn; enum cmn_node_type type = CMN_EVENT_TYPE(event); - unsigned int i, dtc_idx, input_sel; + unsigned int input_sel, i = 0;
if (type == CMN_TYPE_DTC) { - i = 0; while (cmn->dtc[i].cycles) if (++i == cmn->num_dtcs) return -ENOSPC;
cmn->dtc[i].cycles = event; - hw->dtc_idx = CMN_DT_NUM_COUNTERS; - hw->dtcs_used = 1U << i; + hw->dtc_idx[0] = i;
if (flags & PERF_EF_START) arm_cmn_event_start(event, 0); @@ -1812,17 +1807,22 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) }
/* Grab a free global counter first... */ - dtc_idx = 0; - while (dtc->counters[dtc_idx]) - if (++dtc_idx == CMN_DT_NUM_COUNTERS) - return -ENOSPC; - - hw->dtc_idx = dtc_idx; + for_each_hw_dtc_idx(hw, j, idx) { + if (j > 0) { + idx = hw->dtc_idx[0]; + } else { + idx = 0; + while (cmn->dtc[j].counters[idx]) + if (++idx == CMN_DT_NUM_COUNTERS) + goto free_dtms; + } + hw->dtc_idx[j] = idx; + }
/* ...then the local counters to feed it. */ for_each_hw_dn(hw, dn, i) { struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; - unsigned int dtm_idx, shift; + unsigned int dtm_idx, shift, d = 0; u64 reg;
dtm_idx = 0; @@ -1841,11 +1841,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
tmp = dtm->wp_event[wp_idx ^ 1]; if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != - CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) + CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp])) goto free_dtms;
input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; - dtm->wp_event[wp_idx] = dtc_idx; + dtm->wp_event[wp_idx] = hw->dtc_idx[d]; writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); @@ -1865,7 +1865,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) dtm->input_sel[dtm_idx] = input_sel; shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); - dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift; dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); @@ -1893,7 +1893,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags) arm_cmn_event_stop(event, PERF_EF_UPDATE);
if (type == CMN_TYPE_DTC) - cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL; + cmn->dtc[hw->dtc_idx[0]].cycles = NULL; else arm_cmn_event_clear(cmn, event, hw->num_dns); } @@ -2074,7 +2074,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { struct arm_cmn_node *dn, *xp; int dtc_idx = 0; - u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); if (!cmn->dtc) @@ -2084,23 +2083,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
+ if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) { + /* We do at least know that a DTC's XP must be in that DTC's domain */ + dn = arm_cmn_node(cmn, CMN_TYPE_DTC); + for (int i = 0; i < cmn->num_dtcs; i++) + arm_cmn_node_to_xp(cmn, dn + i)->dtc = i; + } + for (dn = cmn->dns; dn->type; dn++) { - if (dn->type == CMN_TYPE_XP) { - dn->dtc &= dtcs_present; + if (dn->type == CMN_TYPE_XP) continue; - }
xp = arm_cmn_node_to_xp(cmn, dn); + dn->dtc = xp->dtc; dn->dtm = xp->dtm; if (cmn->multi_dtm) dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
if (dn->type == CMN_TYPE_DTC) { - int err; - /* We do at least know that a DTC's XP must be in that DTC's domain */ - if (xp->dtc == 0xf) - xp->dtc = 1 << dtc_idx; - err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + if (err) return err; } @@ -2258,9 +2260,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->mesh_x = xp->logid;
if (cmn->part == PART_CMN600) - xp->dtc = 0xf; + xp->dtc = -1; else - xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region); + xp->dtc = arm_cmn_dtc_domain(cmn, xp_region);
xp->dtm = dtm - cmn->dtms; arm_cmn_init_dtm(dtm++, xp, 0);