| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-10 18:24:56 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-10 18:24:56 -0800 |
| commit | dcb49710189d104d4edc07709615748dab61341b (patch) | |
| tree | 439d5829e126e2201ba0387fbd4784dca8c0ca5e /arch/x86/kernel | |
| parent | 75b2a603671f9b649abe6b8e0d7662ea8c9f3c51 (diff) | |
| parent | a8848c4b43ad00c8a18db080206e3ffa53a08b91 (diff) | |
Merge tag 'x86_cache_for_v7.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 resource control updates from Borislav Petkov:
- Extend the resctrl machinery to support telemetry monitoring on
Intel (Tony Luck)
In practical terms, this makes it possible to tell how much energy or
how much work can be attributed to a group of tasks tracked under a
single identifier. This work is preceded by proper refactoring of the
resctrl domain handling code.
* tag 'x86_cache_for_v7.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (30 commits)
x86,fs/resctrl: Update documentation for telemetry events
x86/resctrl: Enable RDT_RESOURCE_PERF_PKG
fs/resctrl: Move RMID initialization to first mount
x86,fs/resctrl: Compute number of RMIDs as minimum across resources
fs/resctrl: Move allocation/free of closid_num_dirty_rmid[]
x86/resctrl: Handle number of RMIDs supported by RDT_RESOURCE_PERF_PKG
x86/resctrl: Add energy/perf choices to rdt boot option
x86,fs/resctrl: Handle domain creation/deletion for RDT_RESOURCE_PERF_PKG
fs/resctrl: Refactor rmdir_mondata_subdir_allrdtgrp()
fs/resctrl: Refactor mkdir_mondata_subdir()
x86/resctrl: Read telemetry events
x86/resctrl: Find and enable usable telemetry events
x86,fs/resctrl: Add architectural event pointer
x86,fs/resctrl: Fill in details of events for performance and energy GUIDs
x86/resctrl: Discover hardware telemetry events
fs/resctrl: Emphasize that L3 monitoring resource is required for summing domains
x86,fs/resctrl: Add and initialize a resource for package scope monitoring
x86,fs/resctrl: Add an architectural hook called for first mount
x86,fs/resctrl: Support binary fixed point event counters
x86,fs/resctrl: Handle events that can be read from any CPU
...
Diffstat (limited to 'arch/x86/kernel')
| -rw-r--r-- | arch/x86/kernel/cpu/resctrl/Makefile | 1 |
| -rw-r--r-- | arch/x86/kernel/cpu/resctrl/core.c | 224 |
| -rw-r--r-- | arch/x86/kernel/cpu/resctrl/intel_aet.c | 409 |
| -rw-r--r-- | arch/x86/kernel/cpu/resctrl/internal.h | 46 |
| -rw-r--r-- | arch/x86/kernel/cpu/resctrl/monitor.c | 50 |

5 files changed, 628 insertions(+), 102 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index d8a04b195da2..273ddfa30836 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_X86_CPU_RESCTRL)	+= core.o rdtgroup.o monitor.o
 obj-$(CONFIG_X86_CPU_RESCTRL)	+= ctrlmondata.o
+obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET)	+= intel_aet.o
 obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)	+= pseudo_lock.o
 
 # To allow define_trace.h's recursive include:
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 6ebff44a3f75..7667cf7c4e94 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -100,14 +100,33 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
 			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
 		},
 	},
+	[RDT_RESOURCE_PERF_PKG] =
+	{
+		.r_resctrl = {
+			.name		= "PERF_PKG",
+			.mon_scope	= RESCTRL_PACKAGE,
+			.mon_domains	= mon_domain_init(RDT_RESOURCE_PERF_PKG),
+		},
+	},
 };
 
+/**
+ * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs
+ *				      (minimum across all mon_capable resources)
+ *
+ * Return: Number of supported RMIDs at time of call. Note that mount time
+ *	   enumeration of resources may reduce the number.
+ */
 u32 resctrl_arch_system_num_rmid_idx(void)
 {
-	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+	u32 num_rmids = U32_MAX;
+	struct rdt_resource *r;
+
+	for_each_mon_capable_rdt_resource(r)
+		num_rmids = min(num_rmids, r->mon.num_rmid);
 
 	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
-	return r->mon.num_rmid;
+	return num_rmids == U32_MAX ? 0 : num_rmids;
 }
 
 struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
@@ -368,7 +387,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
 	kfree(hw_dom);
 }
 
-static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom)
 {
 	int idx;
 
@@ -401,11 +420,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *
 }
 
 /**
- * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
  * @num_rmid:	The size of the MBM counter array
  * @hw_dom:	The domain that owns the allocated arrays
+ *
+ * Return: 0 for success, or -ENOMEM.
  */
-static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
+static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom)
 {
 	size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
 	enum resctrl_event_id eventid;
@@ -438,6 +459,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
 		return get_cpu_cacheinfo_id(cpu, scope);
 	case RESCTRL_L3_NODE:
 		return cpu_to_node(cpu);
+	case RESCTRL_PACKAGE:
+		return topology_physical_package_id(cpu);
 	default:
 		break;
 	}
@@ -464,7 +487,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
 
 	hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
 	if (hdr) {
-		if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+		if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
 			return;
 
 		d = container_of(hdr, struct rdt_ctrl_domain, hdr);
@@ -481,6 +504,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
 	d = &hw_dom->d_resctrl;
 	d->hdr.id = id;
 	d->hdr.type = RESCTRL_CTRL_DOMAIN;
+	d->hdr.rid = r->rid;
 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
 
 	rdt_domain_reconfigure_cdp(r);
@@ -500,37 +524,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
 	}
 }
 
-static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos)
 {
-	int id = get_domain_id_from_scope(cpu, r->mon_scope);
-	struct list_head *add_pos = NULL;
-	struct rdt_hw_mon_domain *hw_dom;
-	struct rdt_domain_hdr *hdr;
-	struct rdt_mon_domain *d;
+	struct rdt_hw_l3_mon_domain *hw_dom;
+	struct rdt_l3_mon_domain *d;
 	struct cacheinfo *ci;
 	int err;
 
-	lockdep_assert_held(&domain_list_lock);
-
-	if (id < 0) {
-		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
-			     cpu, r->mon_scope, r->name);
-		return;
-	}
-
-	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
-	if (hdr) {
-		if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
-			return;
-
-		d = container_of(hdr, struct rdt_mon_domain, hdr);
-
-		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
-		/* Update the mbm_assign_mode state for the CPU if supported */
-		if (r->mon.mbm_cntr_assignable)
-			resctrl_arch_mbm_cntr_assign_set_one(r);
-		return;
-	}
-
 	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
 	if (!hw_dom)
 		return;
@@ -538,33 +538,66 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	d = &hw_dom->d_resctrl;
 	d->hdr.id = id;
 	d->hdr.type = RESCTRL_MON_DOMAIN;
+	d->hdr.rid = RDT_RESOURCE_L3;
 	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
 	if (!ci) {
 		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
-		mon_domain_free(hw_dom);
+		l3_mon_domain_free(hw_dom);
 		return;
 	}
 	d->ci_id = ci->id;
 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
 
-	/* Update the mbm_assign_mode state for the CPU if supported */
-	if (r->mon.mbm_cntr_assignable)
-		resctrl_arch_mbm_cntr_assign_set_one(r);
-
 	arch_mon_domain_online(r, d);
 
-	if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
-		mon_domain_free(hw_dom);
+	if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
+		l3_mon_domain_free(hw_dom);
 		return;
 	}
 
 	list_add_tail_rcu(&d->hdr.list, add_pos);
 
-	err = resctrl_online_mon_domain(r, d);
+	err = resctrl_online_mon_domain(r, &d->hdr);
 	if (err) {
 		list_del_rcu(&d->hdr.list);
 		synchronize_rcu();
-		mon_domain_free(hw_dom);
+		l3_mon_domain_free(hw_dom);
+	}
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+	int id = get_domain_id_from_scope(cpu, r->mon_scope);
+	struct list_head *add_pos = NULL;
+	struct rdt_domain_hdr *hdr;
+
+	lockdep_assert_held(&domain_list_lock);
+
+	if (id < 0) {
+		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+			     cpu, r->mon_scope, r->name);
+		return;
+	}
+
+	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+	if (hdr)
+		cpumask_set_cpu(cpu, &hdr->cpu_mask);
+
+	switch (r->rid) {
+	case RDT_RESOURCE_L3:
+		/* Update the mbm_assign_mode state for the CPU if supported */
+		if (r->mon.mbm_cntr_assignable)
+			resctrl_arch_mbm_cntr_assign_set_one(r);
+		if (!hdr)
+			l3_mon_domain_setup(cpu, id, r, add_pos);
+		break;
+	case RDT_RESOURCE_PERF_PKG:
+		if (!hdr)
+			intel_aet_mon_domain_setup(cpu, id, r, add_pos);
+		break;
+	default:
+		pr_warn_once("Unknown resource rid=%d\n", r->rid);
+		break;
 	}
 }
 
@@ -598,36 +631,33 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
 		return;
 	}
 
-	if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
+	if (!cpumask_empty(&hdr->cpu_mask))
+		return;
+
+	if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
 		return;
 
 	d = container_of(hdr, struct rdt_ctrl_domain, hdr);
 	hw_dom = resctrl_to_arch_ctrl_dom(d);
 
-	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
-	if (cpumask_empty(&d->hdr.cpu_mask)) {
-		resctrl_offline_ctrl_domain(r, d);
-		list_del_rcu(&d->hdr.list);
-		synchronize_rcu();
-
-		/*
-		 * rdt_ctrl_domain "d" is going to be freed below, so clear
-		 * its pointer from pseudo_lock_region struct.
-		 */
-		if (d->plr)
-			d->plr->d = NULL;
-		ctrl_domain_free(hw_dom);
+	resctrl_offline_ctrl_domain(r, d);
+	list_del_rcu(&hdr->list);
+	synchronize_rcu();
 
-		return;
-	}
+	/*
+	 * rdt_ctrl_domain "d" is going to be freed below, so clear
+	 * its pointer from pseudo_lock_region struct.
+	 */
+	if (d->plr)
+		d->plr->d = NULL;
+	ctrl_domain_free(hw_dom);
 }
 
 static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
 {
 	int id = get_domain_id_from_scope(cpu, r->mon_scope);
-	struct rdt_hw_mon_domain *hw_dom;
 	struct rdt_domain_hdr *hdr;
-	struct rdt_mon_domain *d;
 
 	lockdep_assert_held(&domain_list_lock);
 
@@ -644,20 +674,42 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
 		return;
 	}
 
-	if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
+	if (!cpumask_empty(&hdr->cpu_mask))
 		return;
 
-	d = container_of(hdr, struct rdt_mon_domain, hdr);
-	hw_dom = resctrl_to_arch_mon_dom(d);
+	switch (r->rid) {
+	case RDT_RESOURCE_L3: {
+		struct rdt_hw_l3_mon_domain *hw_dom;
+		struct rdt_l3_mon_domain *d;
 
-	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
-	if (cpumask_empty(&d->hdr.cpu_mask)) {
-		resctrl_offline_mon_domain(r, d);
-		list_del_rcu(&d->hdr.list);
+		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
+			return;
+
+		d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
+		hw_dom = resctrl_to_arch_mon_dom(d);
+		resctrl_offline_mon_domain(r, hdr);
+		list_del_rcu(&hdr->list);
 		synchronize_rcu();
-		mon_domain_free(hw_dom);
+		l3_mon_domain_free(hw_dom);
+		break;
+	}
+	case RDT_RESOURCE_PERF_PKG: {
+		struct rdt_perf_pkg_mon_domain *pkgd;
 
-		return;
+		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG))
+			return;
+
+		pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr);
+		resctrl_offline_mon_domain(r, hdr);
+		list_del_rcu(&hdr->list);
+		synchronize_rcu();
+		kfree(pkgd);
+		break;
+	}
+	default:
+		pr_warn_once("Unknown resource rid=%d\n", r->rid);
+		break;
 	}
 }
 
@@ -712,6 +764,28 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
+void resctrl_arch_pre_mount(void)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
+	int cpu;
+
+	if (!intel_aet_get_events())
+		return;
+
+	/*
+	 * Late discovery of telemetry events means the domains for the
+	 * resource were not built. Do that now.
+	 */
+	cpus_read_lock();
+	mutex_lock(&domain_list_lock);
+	r->mon_capable = true;
+	rdt_mon_capable = true;
+	for_each_online_cpu(cpu)
+		domain_add_cpu_mon(cpu, r);
+	mutex_unlock(&domain_list_lock);
+	cpus_read_unlock();
+}
+
 enum {
 	RDT_FLAG_CMT,
 	RDT_FLAG_MBM_TOTAL,
@@ -767,6 +841,8 @@ static int __init set_rdt_options(char *str)
 		force_off = *tok == '!';
 		if (force_off)
 			tok++;
+		if (intel_handle_aet_option(force_off, tok))
+			continue;
 		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
 			if (strcmp(tok, o->name) == 0) {
 				if (force_off)
@@ -889,15 +965,15 @@ static __init bool get_rdt_mon_resources(void)
 	bool ret = false;
 
 	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
-		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
-		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
-		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_ABMC))
@@ -906,7 +982,7 @@ static __init bool get_rdt_mon_resources(void)
 	if (!ret)
 		return false;
 
-	return !rdt_get_mon_l3_config(r);
+	return !rdt_get_l3_mon_config(r);
 }
 
 static __init void __check_quirks_intel(void)
@@ -1084,6 +1160,8 @@ late_initcall(resctrl_arch_late_init);
 
 static void __exit resctrl_arch_exit(void)
 {
+	intel_aet_exit();
+
 	cpuhp_remove_state(rdt_online);
 
 	resctrl_exit();
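A note on the resctrl_arch_system_num_rmid_idx() change above: all mon-capable resources share one RMID index space, so only an index that is valid for every resource is usable, hence the minimum. Below is a standalone model of that reduction with hypothetical per-resource counts (compilable outside the kernel; not the kernel code itself):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical per-resource RMID counts, e.g. L3 and PERF_PKG */
	uint32_t num_rmid[] = { 256, 576 };
	uint32_t min = UINT32_MAX;

	/* Mirror the minimum-across-resources reduction */
	for (size_t i = 0; i < sizeof(num_rmid) / sizeof(num_rmid[0]); i++)
		if (num_rmid[i] < min)
			min = num_rmid[i];

	/* UINT32_MAX surviving means no mon-capable resource was found */
	printf("usable RMIDs: %u\n", min == UINT32_MAX ? 0 : min);
	return 0;
}
```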
diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
new file mode 100644
index 000000000000..89b8b619d5d5
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Intel Application Energy Telemetry
+ *
+ * Copyright (C) 2025 Intel Corporation
+ *
+ * Author:
+ *	Tony Luck <tony.luck@intel.com>
+ */
+
+#define pr_fmt(fmt)	"resctrl: " fmt
+
+#include <linux/bits.h>
+#include <linux/compiler_types.h>
+#include <linux/container_of.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/gfp_types.h>
+#include <linux/init.h>
+#include <linux/intel_pmt_features.h>
+#include <linux/intel_vsec.h>
+#include <linux/io.h>
+#include <linux/minmax.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/resctrl.h>
+#include <linux/resctrl_types.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+
+#include "internal.h"
+
+/**
+ * struct pmt_event - Telemetry event.
+ * @id:		Resctrl event id.
+ * @idx:	Counter index within each per-RMID block of counters.
+ * @bin_bits:	Zero for integer valued events, else number of bits in the
+ *		fraction part of a fixed-point value.
+ */
+struct pmt_event {
+	enum resctrl_event_id	id;
+	unsigned int		idx;
+	unsigned int		bin_bits;
+};
+
+#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits }
+
+/**
+ * struct event_group - Events with the same feature type ("energy" or "perf") and GUID.
+ * @pfname:	PMT feature name ("energy" or "perf") of this event group.
+ *		Used by the "rdt=" boot option.
+ * @pfg:	Points to the aggregated telemetry space information
+ *		returned by the intel_pmt_get_regions_by_feature()
+ *		call to the INTEL_PMT_TELEMETRY driver that contains
+ *		data for all telemetry regions of type @pfname.
+ *		Valid if the system supports the event group,
+ *		NULL otherwise.
+ * @force_off:	True when the "rdt" command line or architecture code disables
+ *		this event group due to insufficient RMIDs.
+ * @force_on:	True when the "rdt" command line overrides disable of this
+ *		event group.
+ * @guid:	Unique number per XML description file.
+ * @num_rmid:	Number of RMIDs supported by this group. May be
+ *		adjusted downwards if enumeration from
+ *		intel_pmt_get_regions_by_feature() indicates fewer
+ *		RMIDs can be tracked simultaneously.
+ * @mmio_size:	Number of bytes of MMIO registers for this group.
+ * @num_events:	Number of events in this group.
+ * @evts:	Array of event descriptors.
+ */
+struct event_group {
+	/* Data fields for additional structures to manage this group. */
+	const char		*pfname;
+	struct pmt_feature_group *pfg;
+	bool			force_off, force_on;
+
+	/* Remaining fields initialized from XML file. */
+	u32			guid;
+	u32			num_rmid;
+	size_t			mmio_size;
+	unsigned int		num_events;
+	struct pmt_event	evts[] __counted_by(num_events);
+};
+
+#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \
+	(((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64))
+
+/*
+ * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
+ */
+static struct event_group energy_0x26696143 = {
+	.pfname		= "energy",
+	.guid		= 0x26696143,
+	.num_rmid	= 576,
+	.mmio_size	= XML_MMIO_SIZE(576, 2, 3),
+	.num_events	= 2,
+	.evts		= {
+		EVT(PMT_EVENT_ENERGY, 0, 18),
+		EVT(PMT_EVENT_ACTIVITY, 1, 18),
+	}
+};
+
+/*
+ * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
+ */
+static struct event_group perf_0x26557651 = {
+	.pfname		= "perf",
+	.guid		= 0x26557651,
+	.num_rmid	= 576,
+	.mmio_size	= XML_MMIO_SIZE(576, 7, 3),
+	.num_events	= 7,
+	.evts		= {
+		EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0),
+		EVT(PMT_EVENT_C1_RES, 1, 0),
+		EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0),
+		EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0),
+		EVT(PMT_EVENT_AUTO_C6_RES, 4, 0),
+		EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0),
+		EVT(PMT_EVENT_UOPS_RETIRED, 6, 0),
+	}
+};
+
+static struct event_group *known_event_groups[] = {
+	&energy_0x26696143,
+	&perf_0x26557651,
+};
+
+#define for_each_event_group(_peg) \
+	for (_peg = known_event_groups; \
+	     _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \
+	     _peg++)
+
+bool intel_handle_aet_option(bool force_off, char *tok)
+{
+	struct event_group **peg;
+	bool ret = false;
+	u32 guid = 0;
+	char *name;
+
+	if (!tok)
+		return false;
+
+	name = strsep(&tok, ":");
+	if (tok && kstrtou32(tok, 16, &guid))
+		return false;
+
+	for_each_event_group(peg) {
+		if (strcmp(name, (*peg)->pfname))
+			continue;
+		if (guid && (*peg)->guid != guid)
+			continue;
+		if (force_off)
+			(*peg)->force_off = true;
+		else
+			(*peg)->force_on = true;
+		ret = true;
+	}
+
+	return ret;
+}
+
+static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e)
+{
+	if (tr->guid != e->guid)
+		return true;
+	if (tr->plat_info.package_id >= topology_max_packages()) {
+		pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id,
+			tr->guid);
+		return true;
+	}
+	if (tr->size != e->mmio_size) {
+		pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n",
+			tr->size, e->guid, e->mmio_size);
+		return true;
+	}
+
+	return false;
+}
+
+static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p)
+{
+	bool usable_regions = false;
+
+	for (int i = 0; i < p->count; i++) {
+		if (skip_telem_region(&p->regions[i], e)) {
+			/*
+			 * Clear the address field of regions that did not pass the checks in
+			 * skip_telem_region() so they will not be used by intel_aet_read_event().
+			 * This is safe to do because intel_pmt_get_regions_by_feature() allocates
+			 * a new pmt_feature_group structure to return to each caller and only makes
+			 * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group()
+			 * returns the structure.
+			 */
+			p->regions[i].addr = NULL;
+
+			continue;
+		}
+		usable_regions = true;
+	}
+
+	return usable_regions;
+}
+
+static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p)
+{
+	struct telemetry_region *tr;
+
+	for (int i = 0; i < p->count; i++) {
+		if (!p->regions[i].addr)
+			continue;
+		tr = &p->regions[i];
+		if (tr->num_rmids < e->num_rmid) {
+			e->force_off = true;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
+	int skipped_events = 0;
+
+	if (e->force_off)
+		return false;
+
+	if (!group_has_usable_regions(e, p))
+		return false;
+
+	/*
+	 * Only enable an event group with insufficient RMIDs if the user
+	 * requested it from the kernel command line.
+	 */
+	if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) {
+		pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n",
+			r->name, e->pfname, e->guid);
+		return false;
+	}
+
+	for (int i = 0; i < p->count; i++) {
+		if (!p->regions[i].addr)
+			continue;
+		/*
+		 * e->num_rmid is only adjusted lower if the user (via the rdt=
+		 * kernel parameter) forces an event group with insufficient
+		 * RMIDs to be enabled.
+		 */
+		e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids);
+	}
+
+	for (int j = 0; j < e->num_events; j++) {
+		if (!resctrl_enable_mon_event(e->evts[j].id, true,
+					      e->evts[j].bin_bits, &e->evts[j]))
+			skipped_events++;
+	}
+	if (e->num_events == skipped_events) {
+		pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid);
+		return false;
+	}
+
+	if (r->mon.num_rmid)
+		r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid);
+	else
+		r->mon.num_rmid = e->num_rmid;
+
+	if (skipped_events)
+		pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name,
+			e->pfname, e->guid, skipped_events);
+	else
+		pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid);
+
+	return true;
+}
+
+static enum pmt_feature_id lookup_pfid(const char *pfname)
+{
+	if (!strcmp(pfname, "energy"))
+		return FEATURE_PER_RMID_ENERGY_TELEM;
+	else if (!strcmp(pfname, "perf"))
+		return FEATURE_PER_RMID_PERF_TELEM;
+
+	pr_warn("Unknown PMT feature name '%s'\n", pfname);
+
+	return FEATURE_INVALID;
+}
+
+/*
+ * Request a copy of struct pmt_feature_group for each event group. If there is
+ * one, the returned structure has an array of telemetry_region structures;
+ * each element of the array describes one telemetry aggregator. The
+ * telemetry aggregators may have different GUIDs, so obtain a duplicate struct
+ * pmt_feature_group for event groups with the same feature type but different
+ * GUID. Post-processing ensures an event group can only use the telemetry
+ * aggregators that match its GUID. An event group keeps a pointer to its
+ * struct pmt_feature_group to indicate that its events were successfully
+ * enabled.
+ */
+bool intel_aet_get_events(void)
+{
+	struct pmt_feature_group *p;
+	enum pmt_feature_id pfid;
+	struct event_group **peg;
+	bool ret = false;
+
+	for_each_event_group(peg) {
+		pfid = lookup_pfid((*peg)->pfname);
+		p = intel_pmt_get_regions_by_feature(pfid);
+		if (IS_ERR_OR_NULL(p))
+			continue;
+		if (enable_events(*peg, p)) {
+			(*peg)->pfg = p;
+			ret = true;
+		} else {
+			intel_pmt_put_feature_group(p);
+		}
+	}
+
+	return ret;
+}
+
+void __exit intel_aet_exit(void)
+{
+	struct event_group **peg;
+
+	for_each_event_group(peg) {
+		if ((*peg)->pfg) {
+			intel_pmt_put_feature_group((*peg)->pfg);
+			(*peg)->pfg = NULL;
+		}
+	}
+}
+
+#define DATA_VALID	BIT_ULL(63)
+#define DATA_BITS	GENMASK_ULL(62, 0)
+
+/*
+ * Read the counter for an event on a domain (summing all aggregators on
+ * the domain). If an aggregator hasn't received any data for a specific
+ * RMID, the MMIO read indicates that the data is not valid. Return success
+ * if at least one aggregator has valid data.
+ */
+int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val)
+{
+	struct pmt_event *pevt = arch_priv;
+	struct event_group *e;
+	bool valid = false;
+	u64 total = 0;
+	u64 evtcount;
+	void *pevt0;
+	u32 idx;
+
+	pevt0 = pevt - pevt->idx;
+	e = container_of(pevt0, struct event_group, evts);
+	idx = rmid * e->num_events;
+	idx += pevt->idx;
+
+	if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) {
+		pr_warn_once("MMIO index %u out of range\n", idx);
+		return -EIO;
+	}
+
+	for (int i = 0; i < e->pfg->count; i++) {
+		if (!e->pfg->regions[i].addr)
+			continue;
+		if (e->pfg->regions[i].plat_info.package_id != domid)
+			continue;
+		evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64));
+		if (!(evtcount & DATA_VALID))
+			continue;
+		total += evtcount & DATA_BITS;
+		valid = true;
+	}
+
+	if (valid)
+		*val = total;
+
+	return valid ? 0 : -EINVAL;
+}
+
+void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r,
+				struct list_head *add_pos)
+{
+	struct rdt_perf_pkg_mon_domain *d;
+	int err;
+
+	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+	if (!d)
+		return;
+
+	d->hdr.id = id;
+	d->hdr.type = RESCTRL_MON_DOMAIN;
+	d->hdr.rid = RDT_RESOURCE_PERF_PKG;
+	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+	list_add_tail_rcu(&d->hdr.list, add_pos);
+
+	err = resctrl_online_mon_domain(r, &d->hdr);
+	if (err) {
+		list_del_rcu(&d->hdr.list);
+		synchronize_rcu();
+		kfree(d);
+	}
+}
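For orientation, the MMIO layout that intel_aet_read_event() indexes: each aggregator exposes a flat array of u64 counters at rmid * num_events + event_index, where bit 63 flags valid data and bits 62:0 hold the count, and energy events are binary fixed point with 18 fraction bits. Below is a standalone model with made-up register contents (not real hardware values):

```c
#include <stdint.h>
#include <stdio.h>

#define DATA_VALID	(1ULL << 63)
#define DATA_BITS	(DATA_VALID - 1)	/* bits 62:0 */

/* Fetch one counter; returns -1 if the aggregator has no valid data */
static int read_counter(const uint64_t *mmio, unsigned int num_events,
			uint32_t rmid, unsigned int evt_idx, uint64_t *val)
{
	uint64_t raw = mmio[rmid * num_events + evt_idx];

	if (!(raw & DATA_VALID))
		return -1;
	*val = raw & DATA_BITS;
	return 0;
}

int main(void)
{
	/* Fake aggregator: 2 events; RMID 1 holds 2.5 J in 18-bit fixed point */
	uint64_t mmio[4] = { 0, 0, DATA_VALID | (5ULL << 17), 0 };
	uint64_t v;

	if (!read_counter(mmio, 2, 1, 0, &v))
		printf("energy: %llu.%02llu J\n",
		       (unsigned long long)(v >> 18),
		       (unsigned long long)(((v & ((1 << 18) - 1)) * 100) >> 18));
	return 0;
}
```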
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 79c18657ede0..e3cfa0c10e92 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -66,17 +66,17 @@ struct rdt_hw_ctrl_domain {
 };
 
 /**
- * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
- *			      a resource for a monitor function
- * @d_resctrl:	Properties exposed to the resctrl file system
+ * struct rdt_hw_l3_mon_domain - Arch private attributes of a set of CPUs sharing
+ *				 RDT_RESOURCE_L3 monitoring
+ * @d_resctrl:	Properties exposed to the resctrl file system
  * @arch_mbm_states:	Per-event pointer to the MBM event's saved state.
  *			An MBM event's state is an array of struct arch_mbm_state
  *			indexed by RMID on x86.
  *
 * Members of this structure are accessed via helpers that provide abstraction.
  */
-struct rdt_hw_mon_domain {
-	struct rdt_mon_domain		d_resctrl;
+struct rdt_hw_l3_mon_domain {
+	struct rdt_l3_mon_domain	d_resctrl;
 	struct arch_mbm_state		*arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
 };
 
@@ -85,12 +85,20 @@ static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctr
 	return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
 }
 
-static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3_mon_domain *r)
 {
-	return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
+	return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl);
 }
 
 /**
+ * struct rdt_perf_pkg_mon_domain - CPUs sharing a package scoped resctrl monitor resource
+ * @hdr:	common header for different domain types
+ */
+struct rdt_perf_pkg_mon_domain {
+	struct rdt_domain_hdr	hdr;
+};
+
+/**
  * struct msr_param - set a range of MSRs from a domain
  * @res:	The resource to use
  * @dom:	The domain to update
@@ -143,7 +151,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
 
 extern struct rdt_hw_resource rdt_resources_all[];
 
-void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d);
 
 /* CPUID.(EAX=10H, ECX=ResID=1).EAX */
 union cpuid_0x10_1_eax {
@@ -216,7 +224,7 @@ union l3_qos_abmc_cfg {
 
 void rdt_ctrl_update(void *arg);
 
-int rdt_get_mon_l3_config(struct rdt_resource *r);
+int rdt_get_l3_mon_config(struct rdt_resource *r);
 
 bool rdt_cpu_has(int flag);
 
@@ -225,4 +233,24 @@ void __init intel_rdt_mbm_apply_quirk(void);
 void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
 void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
 
+#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET
+bool intel_aet_get_events(void);
+void __exit intel_aet_exit(void);
+int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val);
+void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r,
+				struct list_head *add_pos);
+bool intel_handle_aet_option(bool force_off, char *tok);
+#else
+static inline bool intel_aet_get_events(void) { return false; }
+static inline void __exit intel_aet_exit(void) { }
+static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val)
+{
+	return -EINVAL;
+}
+
+static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r,
+					      struct list_head *add_pos) { }
+static inline bool intel_handle_aet_option(bool force_off, char *tok) { return false; }
+#endif
+
 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
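The resctrl_to_arch_mon_dom() helper above depends on the usual container_of() idiom: the arch-private wrapper embeds the structure the filesystem layer holds, so a pointer to the embedded member can be converted back to the wrapper. A standalone illustration with toy types (not the kernel definitions):

```c
#include <stddef.h>
#include <stdio.h>

/* Same arithmetic as the kernel macro: step back from member to container */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fs_domain {		/* what the fs layer passes around */
	int id;
};

struct arch_domain {		/* arch wrapper with private state */
	struct fs_domain d_resctrl;
	unsigned long private_state;
};

int main(void)
{
	struct arch_domain hw = { .d_resctrl = { .id = 3 }, .private_state = 42 };
	struct fs_domain *d = &hw.d_resctrl;	/* fs layer only sees this */
	struct arch_domain *back = container_of(d, struct arch_domain, d_resctrl);

	printf("id=%d private=%lu\n", back->d_resctrl.id, back->private_state);
	return 0;
}
```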
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index dffcc8307500..e6a154240b8d 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -109,7 +109,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
  *
  * In RMID sharing mode there are fewer "logical RMID" values available
  * to accumulate data ("physical RMIDs" are divided evenly between SNC
- * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for
  * each SNC node.
  *
  * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
@@ -157,7 +157,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
 	return 0;
 }
 
-static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom,
 						 u32 rmid,
 						 enum resctrl_event_id eventid)
 {
@@ -171,11 +171,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_do
 	return state ? &state[rmid] : NULL;
 }
 
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
 			     u32 unused, u32 rmid,
 			     enum resctrl_event_id eventid)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	int cpu = cpumask_any(&d->hdr.cpu_mask);
 	struct arch_mbm_state *am;
 	u32 prmid;
@@ -194,9 +194,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
  * Assumes that hardware counters are also reset and thus that there is
  * no need to record initial non-zero counts.
  */
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	enum resctrl_event_id eventid;
 	int idx;
 
@@ -217,10 +217,10 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
 	return chunks >> shift;
 }
 
-static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
 			     u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct arch_mbm_state *am;
 	u64 chunks;
@@ -238,19 +238,29 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
 	return chunks * hw_res->mon_scale;
 }
 
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
 			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
-			   u64 *val, void *ignored)
+			   void *arch_priv, u64 *val, void *ignored)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
-	int cpu = cpumask_any(&d->hdr.cpu_mask);
+	struct rdt_hw_l3_mon_domain *hw_dom;
+	struct rdt_l3_mon_domain *d;
 	struct arch_mbm_state *am;
 	u64 msr_val;
 	u32 prmid;
+	int cpu;
 	int ret;
 
 	resctrl_arch_rmid_read_context_check();
 
+	if (r->rid == RDT_RESOURCE_PERF_PKG)
+		return intel_aet_read_event(hdr->id, rmid, arch_priv, val);
+
+	if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
+		return -EINVAL;
+
+	d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
+	hw_dom = resctrl_to_arch_mon_dom(d);
+	cpu = cpumask_any(&hdr->cpu_mask);
 	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
 	ret = __rmid_read_phys(prmid, eventid, &msr_val);
 
@@ -302,11 +312,11 @@ static int __cntr_id_read(u32 cntr_id, u64 *val)
 	return 0;
 }
 
-void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
 			     u32 unused, u32 rmid, int cntr_id,
 			     enum resctrl_event_id eventid)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	struct arch_mbm_state *am;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
@@ -318,7 +328,7 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
 	}
 }
 
-int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
 			   u32 unused, u32 rmid, int cntr_id,
 			   enum resctrl_event_id eventid, u64 *val)
 {
@@ -348,7 +358,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
  * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
-void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
 {
 	if (snc_nodes_per_l3_cache > 1)
 		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
@@ -418,7 +428,7 @@ static __init int snc_get_config(void)
 	return ret;
 }
 
-int __init rdt_get_mon_l3_config(struct rdt_resource *r)
+int __init rdt_get_l3_mon_config(struct rdt_resource *r)
 {
 	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -510,7 +520,7 @@ static void resctrl_abmc_set_one_amd(void *arg)
  */
 static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
 {
-	struct rdt_mon_domain *d;
+	struct rdt_l3_mon_domain *d;
 
 	lockdep_assert_cpus_held();
 
@@ -549,11 +559,11 @@ static void resctrl_abmc_config_one_amd(void *info)
 /*
  * Send an IPI to the domain to assign the counter to RMID, event pair.
  */
-void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
 			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
 			      u32 cntr_id, bool assign)
 {
-	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	union l3_qos_abmc_cfg abmc_cfg = { 0 };
 	struct arch_mbm_state *am;
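One piece of monitor.c context the hunks above show only in passing: mbm_overflow_count() has to cope with MBM hardware counters narrower than 64 bits, and masking the subtraction to the counter width makes a single wraparound come out right. A standalone model of that masked delta (the kernel version also shifts and scales chunk counts; this sketch shows only the wraparound handling):

```c
#include <stdint.h>
#include <stdio.h>

/* Delta between two samples of a width-bit counter, wraparound-safe */
static uint64_t mbm_delta(uint64_t prev, uint64_t cur, unsigned int width)
{
	uint64_t mask = (width == 64) ? ~0ULL : (1ULL << width) - 1;

	return (cur - prev) & mask;
}

int main(void)
{
	/* 24-bit counter wrapped: prev near the top, cur just past zero */
	printf("delta=%llu\n",
	       (unsigned long long)mbm_delta(0xfffff0, 0x000010, 24));
	return 0;
}
```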
