// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
* - Intel Application Energy Telemetry
*
* Copyright (C) 2025 Intel Corporation
*
* Author:
* Tony Luck <tony.luck@intel.com>
*/
#define pr_fmt(fmt) "resctrl: " fmt

#include <linux/array_size.h>
#include <linux/bits.h>
#include <linux/compiler_types.h>
#include <linux/container_of.h>
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/gfp_types.h>
#include <linux/init.h>
#include <linux/intel_pmt_features.h>
#include <linux/intel_vsec.h>
#include <linux/io.h>
#include <linux/kstrtox.h>
#include <linux/minmax.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/resctrl.h>
#include <linux/resctrl_types.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/topology.h>
#include <linux/types.h>

#include "internal.h"

/**
* struct pmt_event - Telemetry event.
* @id: Resctrl event id.
* @idx: Counter index within each per-RMID block of counters.
 * @bin_bits: Zero for integer-valued events, else the number of bits in the
 *            fractional part of the fixed-point value.
*/
struct pmt_event {
enum resctrl_event_id id;
unsigned int idx;
unsigned int bin_bits;
};

#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits }

/**
* struct event_group - Events with the same feature type ("energy" or "perf") and GUID.
* @pfname: PMT feature name ("energy" or "perf") of this event group.
* Used by boot rdt= option.
* @pfg: Points to the aggregated telemetry space information
* returned by the intel_pmt_get_regions_by_feature()
* call to the INTEL_PMT_TELEMETRY driver that contains
* data for all telemetry regions of type @pfname.
* Valid if the system supports the event group,
* NULL otherwise.
* @force_off: True when "rdt" command line or architecture code disables
* this event group due to insufficient RMIDs.
* @force_on: True when "rdt" command line overrides disable of this
* event group.
* @guid: Unique number per XML description file.
* @num_rmid: Number of RMIDs supported by this group. May be
* adjusted downwards if enumeration from
* intel_pmt_get_regions_by_feature() indicates fewer
* RMIDs can be tracked simultaneously.
* @mmio_size: Number of bytes of MMIO registers for this group.
* @num_events: Number of events in this group.
* @evts: Array of event descriptors.
*/
struct event_group {
/* Data fields for additional structures to manage this group. */
const char *pfname;
struct pmt_feature_group *pfg;
	bool force_off, force_on;

	/* Remaining fields initialized from XML file. */
u32 guid;
u32 num_rmid;
size_t mmio_size;
unsigned int num_events;
struct pmt_event evts[] __counted_by(num_events);
};

#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \
	(((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64))

/*
* Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
*/
static struct event_group energy_0x26696143 = {
.pfname = "energy",
.guid = 0x26696143,
.num_rmid = 576,
.mmio_size = XML_MMIO_SIZE(576, 2, 3),
.num_events = 2,
.evts = {
EVT(PMT_EVENT_ENERGY, 0, 18),
EVT(PMT_EVENT_ACTIVITY, 1, 18),
}
};

/*
* Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
*/
static struct event_group perf_0x26557651 = {
.pfname = "perf",
.guid = 0x26557651,
.num_rmid = 576,
.mmio_size = XML_MMIO_SIZE(576, 7, 3),
.num_events = 7,
.evts = {
EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0),
EVT(PMT_EVENT_C1_RES, 1, 0),
EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0),
EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0),
EVT(PMT_EVENT_AUTO_C6_RES, 4, 0),
EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0),
EVT(PMT_EVENT_UOPS_RETIRED, 6, 0),
}
};

static struct event_group *known_event_groups[] = {
&energy_0x26696143,
&perf_0x26557651,
};

#define for_each_event_group(_peg) \
for (_peg = known_event_groups; \
_peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \
_peg++)
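
/*
 * Handle an "energy" or "perf" token from the "rdt=" boot option,
 * e.g. "perf" or "energy:26696143". A token is a PMT feature name,
 * optionally followed by ":" and a GUID in hex to limit the action to
 * a single event group. @force_off distinguishes disable requests from
 * enable requests. Returns true if at least one event group matched
 * the token.
 */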
bool intel_handle_aet_option(bool force_off, char *tok)
{
struct event_group **peg;
bool ret = false;
u32 guid = 0;
	char *name;

	if (!tok)
		return false;

	name = strsep(&tok, ":");
	if (tok && kstrtou32(tok, 16, &guid))
		return false;

	for_each_event_group(peg) {
if (strcmp(name, (*peg)->pfname))
continue;
if (guid && (*peg)->guid != guid)
continue;
if (force_off)
(*peg)->force_off = true;
else
(*peg)->force_on = true;
ret = true;
	}

	return ret;
}
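
/*
 * Check whether a telemetry region belongs to event group @e and
 * passes basic sanity checks. Returns true if the region must be
 * skipped.
 */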
static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e)
{
	if (tr->guid != e->guid)
		return true;

	if (tr->plat_info.package_id >= topology_max_packages()) {
		pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id,
			tr->guid);
		return true;
	}

	if (tr->size != e->mmio_size) {
		pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n",
			tr->size, e->guid, e->mmio_size);
		return true;
	}

	return false;
}
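
/*
 * Returns true if at least one telemetry region in @p passed the
 * checks in skip_telem_region() for event group @e.
 */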
static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p)
{
	bool usable_regions = false;

	for (int i = 0; i < p->count; i++) {
		if (skip_telem_region(&p->regions[i], e)) {
			/*
			 * Clear the address field of regions that did not pass the checks in
			 * skip_telem_region() so they will not be used by intel_aet_read_event().
			 * This is safe to do because intel_pmt_get_regions_by_feature() allocates
			 * a new pmt_feature_group structure to return to each caller and only makes
			 * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group()
			 * returns the structure.
			 */
			p->regions[i].addr = NULL;
			continue;
		}
		usable_regions = true;
	}

	return usable_regions;
}
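
/*
 * Every aggregator supplying data for an event group must be able to
 * track the full set of RMIDs. Force the group off if any usable
 * region tracks fewer than e->num_rmid RMIDs.
 */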
static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p)
{
	struct telemetry_region *tr;

	for (int i = 0; i < p->count; i++) {
		if (!p->regions[i].addr)
			continue;

		tr = &p->regions[i];
		if (tr->num_rmids < e->num_rmid) {
			e->force_off = true;
			return false;
		}
	}

	return true;
}
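
/*
 * Validate the telemetry regions supplied for an event group and
 * enable its events with the resctrl core. Returns true if at least
 * one event was enabled.
 */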
static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
	int skipped_events = 0;

	if (e->force_off)
		return false;

	if (!group_has_usable_regions(e, p))
		return false;

	/*
	 * Only enable an event group with insufficient RMIDs if the user
	 * requested it on the kernel command line.
	 */
	if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) {
		pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n",
			r->name, e->pfname, e->guid);
		return false;
	}

	for (int i = 0; i < p->count; i++) {
		if (!p->regions[i].addr)
			continue;
		/*
		 * e->num_rmid is only adjusted downwards if the user (via the
		 * rdt= kernel parameter) forces an event group with
		 * insufficient RMIDs to be enabled.
		 */
		e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids);
	}

	for (unsigned int j = 0; j < e->num_events; j++) {
		if (!resctrl_enable_mon_event(e->evts[j].id, true,
					      e->evts[j].bin_bits, &e->evts[j]))
			skipped_events++;
	}

	if (e->num_events == skipped_events) {
		pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid);
		return false;
	}

	if (r->mon.num_rmid)
		r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid);
	else
		r->mon.num_rmid = e->num_rmid;

	if (skipped_events)
		pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name,
			e->pfname, e->guid, skipped_events);
	else
		pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid);

	return true;
}
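
/* Map a PMT feature name to the pmt_feature_id known to the telemetry driver. */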
static enum pmt_feature_id lookup_pfid(const char *pfname)
{
	if (!strcmp(pfname, "energy"))
		return FEATURE_PER_RMID_ENERGY_TELEM;
	if (!strcmp(pfname, "perf"))
		return FEATURE_PER_RMID_PERF_TELEM;

	pr_warn("Unknown PMT feature name '%s'\n", pfname);

	return FEATURE_INVALID;
}

/*
 * Request a copy of the struct pmt_feature_group for each event group.
 * If one exists, the returned structure contains an array of
 * telemetry_region structures, each element describing one telemetry
 * aggregator. The telemetry aggregators may have different GUIDs, so
 * event groups with the same feature type but different GUIDs each
 * obtain their own copy of the struct pmt_feature_group.
 * Post-processing ensures an event group only uses the telemetry
 * aggregators that match its GUID. An event group keeps a pointer to
 * its struct pmt_feature_group to indicate that its events were
 * successfully enabled.
 */
bool intel_aet_get_events(void)
{
struct pmt_feature_group *p;
enum pmt_feature_id pfid;
struct event_group **peg;
	bool ret = false;

	for_each_event_group(peg) {
		pfid = lookup_pfid((*peg)->pfname);
		p = intel_pmt_get_regions_by_feature(pfid);
		if (IS_ERR_OR_NULL(p))
			continue;

		if (enable_events(*peg, p)) {
			(*peg)->pfg = p;
			ret = true;
		} else {
			intel_pmt_put_feature_group(p);
		}
	}

	return ret;
}
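
/* Release the pmt_feature_group references taken by intel_aet_get_events(). */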
void __exit intel_aet_exit(void)
{
	struct event_group **peg;

	for_each_event_group(peg) {
		if ((*peg)->pfg) {
			intel_pmt_put_feature_group((*peg)->pfg);
			(*peg)->pfg = NULL;
		}
	}
}
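
/*
 * Each counter is a 64-bit MMIO register: bit 63 is set when the value
 * is valid, bits 62:0 hold the count.
 */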
#define DATA_VALID	BIT_ULL(63)
#define DATA_BITS	GENMASK_ULL(62, 0)

/*
* Read counter for an event on a domain (summing all aggregators on the
* domain). If an aggregator hasn't received any data for a specific RMID,
* the MMIO read indicates that data is not valid. Return success if at
* least one aggregator has valid data.
*/
int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val)
{
struct pmt_event *pevt = arch_priv;
struct event_group *e;
bool valid = false;
u64 total = 0;
u64 evtcount;
void *pevt0;
u32 idx;
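
	/*
	 * @arch_priv points at element pevt->idx of the evts[] array at
	 * the end of a struct event_group. Step back to the start of
	 * the array to recover the enclosing event group.
	 */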
	pevt0 = pevt - pevt->idx;
	e = container_of(pevt0, struct event_group, evts);

	idx = rmid * e->num_events;
	idx += pevt->idx;
	if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) {
		pr_warn_once("MMIO index %u out of range\n", idx);
		return -EIO;
	}

	for (int i = 0; i < e->pfg->count; i++) {
if (!e->pfg->regions[i].addr)
continue;
if (e->pfg->regions[i].plat_info.package_id != domid)
continue;
evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64));
if (!(evtcount & DATA_VALID))
continue;
total += evtcount & DATA_BITS;
valid = true;
	}

	if (valid)
		*val = total;

	return valid ? 0 : -EINVAL;
}
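
/*
 * Allocate and register a new monitoring domain, seeded with @cpu. The
 * domain is unlinked and freed again if resctrl_online_mon_domain()
 * fails.
 */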
void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r,
struct list_head *add_pos)
{
struct rdt_perf_pkg_mon_domain *d;
	int err;

	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
	if (!d)
		return;

	d->hdr.id = id;
d->hdr.type = RESCTRL_MON_DOMAIN;
d->hdr.rid = RDT_RESOURCE_PERF_PKG;
cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
	list_add_tail_rcu(&d->hdr.list, add_pos);

	err = resctrl_online_mon_domain(r, &d->hdr);
if (err) {
list_del_rcu(&d->hdr.list);
synchronize_rcu();
kfree(d);
}
}