Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c  | 230
1 file changed, 179 insertions(+), 51 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95f118230ff5..6781d39f3158 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,7 +165,19 @@ enum event_type_t {
/* see ctx_resched() for details */
EVENT_CPU = 0x10,
EVENT_CGROUP = 0x20,
- EVENT_FLAGS = EVENT_CGROUP,
+
+ /*
+ * EVENT_GUEST is set when scheduling in/out events between the host
+ * and a guest with a mediated vPMU. Among other things, EVENT_GUEST
+ * is used:
+ *
+ * - In for_each_epc() to skip PMUs that don't support events in a
+ * MEDIATED_VPMU guest, i.e. don't need to be context switched.
+ * - To indicate the start/end point of the events in a guest. Guest
+ * running time is deducted for host-only (exclude_guest) events.
+ */
+ EVENT_GUEST = 0x40,
+ EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
/* compound helpers */
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
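
As a reader aid, here is a minimal user-space sketch of the flag arithmetic described in the comment above. The values of EVENT_FLEXIBLE through EVENT_FROZEN are assumed from the surrounding enum (they are not part of this hunk), and nothing below is kernel API; it only shows that EVENT_GUEST is a qualifier bit that can be ORed into an event_type mask without colliding with the scheduling bits.

/*
 * Stand-alone sketch of the event_type_t flag layout. Values for the
 * members not shown in this hunk are assumed. Compile with: cc flags.c
 */
#include <assert.h>
#include <stdio.h>

enum event_type_t {
	EVENT_FLEXIBLE = 0x01,	/* assumed */
	EVENT_PINNED   = 0x02,	/* assumed */
	EVENT_TIME     = 0x04,	/* assumed */
	EVENT_FROZEN   = 0x08,	/* assumed */
	EVENT_CPU      = 0x10,
	EVENT_CGROUP   = 0x20,
	EVENT_GUEST    = 0x40,	/* new in this patch */
	EVENT_FLAGS    = EVENT_CGROUP | EVENT_GUEST,
	EVENT_ALL      = EVENT_FLEXIBLE | EVENT_PINNED,
};

int main(void)
{
	/*
	 * A guest entry/exit would reschedule all events, tagged with
	 * EVENT_GUEST so that only MEDIATED_VPMU-capable PMUs are walked
	 * (see perf_skip_pmu_ctx() below).
	 */
	unsigned int guest_switch = EVENT_ALL | EVENT_GUEST;

	/*
	 * EVENT_GUEST is a qualifier: it does not collide with the
	 * scheduling bits, and EVENT_FLAGS masks exactly the qualifiers.
	 */
	assert(!(EVENT_GUEST & (EVENT_ALL | EVENT_TIME | EVENT_FROZEN)));
	assert((guest_switch & EVENT_FLAGS) == EVENT_GUEST);

	printf("guest switch mask: %#x\n", guest_switch);
	return 0;
}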
@@ -458,6 +470,11 @@ static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+ return false;
+}
+
/*
* perf event paranoia level:
* -1 - not paranoid at all
@@ -784,6 +801,9 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
{
if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
return true;
+ if ((event_type & EVENT_GUEST) &&
+ !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
+ return true;
return false;
}
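
A hedged, stand-alone model of the per-PMU skip decision above, showing how the new EVENT_GUEST test composes with the existing cgroup filter during a for_each_epc()-style walk. The structs are simplified stand-ins for the kernel types, and the PERF_PMU_CAP_MEDIATED_VPMU value is illustrative only.

/* Simplified model of the skip decision made in perf_skip_pmu_ctx(). */
#include <stdbool.h>
#include <stdio.h>

#define EVENT_CGROUP               0x20
#define EVENT_GUEST                0x40
#define PERF_PMU_CAP_MEDIATED_VPMU 0x0400	/* assumed bit, sketch only */

struct pmu_stub     { unsigned int capabilities; };
struct pmu_ctx_stub { struct pmu_stub *pmu; int nr_cgroups; };

static bool skip_pmu_ctx(struct pmu_ctx_stub *epc, unsigned int event_type)
{
	if ((event_type & EVENT_CGROUP) && !epc->nr_cgroups)
		return true;
	/* Guest transitions only touch PMUs that opted into mediated vPMU. */
	if ((event_type & EVENT_GUEST) &&
	    !(epc->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
		return true;
	return false;
}

int main(void)
{
	struct pmu_stub core   = { .capabilities = PERF_PMU_CAP_MEDIATED_VPMU };
	struct pmu_stub uncore = { .capabilities = 0 };
	struct pmu_ctx_stub epcs[] = {
		{ .pmu = &core,   .nr_cgroups = 0 },
		{ .pmu = &uncore, .nr_cgroups = 0 },
	};

	for (int i = 0; i < 2; i++)
		printf("epc %d skipped on guest switch: %d\n",
		       i, skip_pmu_ctx(&epcs[i], EVENT_GUEST));
	return 0;
}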
@@ -834,6 +854,39 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo
WRITE_ONCE(time->offset, time->time - time->stamp);
}
+static_assert(offsetof(struct perf_event_context, timeguest) -
+ offsetof(struct perf_event_context, time) ==
+ sizeof(struct perf_time_ctx));
+
+#define T_TOTAL 0
+#define T_GUEST 1
+
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+ struct perf_time_ctx *times)
+{
+ u64 time = times[T_TOTAL].time;
+
+ if (event->attr.exclude_guest)
+ time -= times[T_GUEST].time;
+
+ return time;
+}
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+ struct perf_time_ctx *times,
+ u64 now)
+{
+ if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) {
+ /*
+ * (now + times[total].offset) - (now + times[guest].offset) :=
+ * times[total].offset - times[guest].offset
+ */
+ return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset);
+ }
+
+ return now + READ_ONCE(times[T_TOTAL].offset);
+}
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
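
The exclude_guest arithmetic above can be checked with a small stand-alone model. It mirrors the layout trick enforced by the static_assert (timeguest directly follows time, so &ctx->time can be indexed as a two-element array) and the offset identity in the comment: while the guest PMU is loaded, the two "now" terms cancel and an exclude_guest reading stands still. This is a user-space sketch, not the kernel code.

/*
 * Stand-alone model of the exclude_guest time deduction. The struct carries
 * only the field subset used here; stamp/offset are unused in this sketch.
 */
#include <stdio.h>

typedef unsigned long long u64;

struct perf_time_ctx { u64 time, stamp, offset; };

#define T_TOTAL 0
#define T_GUEST 1

static u64 event_time(const struct perf_time_ctx *times, int exclude_guest)
{
	u64 t = times[T_TOTAL].time;

	if (exclude_guest)
		t -= times[T_GUEST].time;
	return t;
}

int main(void)
{
	/*
	 * Context enabled for 1000 ns, of which 300 ns were spent in a guest
	 * with the mediated vPMU loaded.
	 */
	struct perf_time_ctx times[2] = {
		[T_TOTAL] = { .time = 1000 },
		[T_GUEST] = { .time = 300  },
	};

	printf("host+guest event: %llu ns\n", event_time(times, 0)); /* 1000 */
	printf("exclude_guest:    %llu ns\n", event_time(times, 1)); /*  700 */

	/*
	 * While the guest runs, both clocks advance with the same "now", so
	 * (now + total.offset) - (now + guest.offset) reduces to
	 * total.offset - guest.offset: the exclude_guest reading is frozen.
	 */
	return 0;
}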
@@ -870,12 +923,16 @@ static inline int is_cgroup_event(struct perf_event *event)
return event->cgrp != NULL;
}
+static_assert(offsetof(struct perf_cgroup_info, timeguest) -
+ offsetof(struct perf_cgroup_info, time) ==
+ sizeof(struct perf_time_ctx));
+
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
- return t->time.time;
+ return __perf_event_time_ctx(event, &t->time);
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
@@ -884,9 +941,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
t = per_cpu_ptr(event->cgrp->info, event->cpu);
if (!__load_acquire(&t->active))
- return t->time.time;
- now += READ_ONCE(t->time.offset);
- return now;
+ return __perf_event_time_ctx(event, &t->time);
+
+ return __perf_event_time_ctx_now(event, &t->time, now);
+}
+
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ update_perf_time_ctx(&info->timeguest, now, adv);
+}
+
+static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
+{
+ update_perf_time_ctx(&info->time, now, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_cgrp_guest_time(info, now, true);
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
@@ -902,7 +971,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- update_perf_time_ctx(&info->time, now, true);
+ update_cgrp_time(info, now);
if (final)
__store_release(&info->active, 0);
}
@@ -925,11 +994,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
* Do not update time when cgroup is not active
*/
if (info->active)
- update_perf_time_ctx(&info->time, perf_clock(), true);
+ update_cgrp_time(info, perf_clock());
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
struct perf_event_context *ctx = &cpuctx->ctx;
struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -949,8 +1018,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- update_perf_time_ctx(&info->time, ctx->time.stamp, false);
- __store_release(&info->active, 1);
+ if (guest) {
+ __update_cgrp_guest_time(info, ctx->time.stamp, false);
+ } else {
+ update_perf_time_ctx(&info->time, ctx->time.stamp, false);
+ __store_release(&info->active, 1);
+ }
}
}
@@ -1154,7 +1227,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
}
@@ -1566,16 +1639,24 @@ static void perf_unpin_context(struct perf_event_context *ctx)
*/
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
- u64 now = perf_clock();
+ lockdep_assert_held(&ctx->lock);
+
+ update_perf_time_ctx(&ctx->time, perf_clock(), adv);
+}
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
lockdep_assert_held(&ctx->lock);
- update_perf_time_ctx(&ctx->time, now, adv);
+ /* must be called after __update_context_time(); */
+ update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
}
static void update_context_time(struct perf_event_context *ctx)
{
__update_context_time(ctx, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_context_guest_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
@@ -1588,7 +1669,7 @@ static u64 perf_event_time(struct perf_event *event)
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx->time.time;
+ return __perf_event_time_ctx(event, &ctx->time);
}
static u64 perf_event_time_now(struct perf_event *event, u64 now)
@@ -1602,10 +1683,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
return perf_cgroup_event_time_now(event, now);
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
- return ctx->time.time;
+ return __perf_event_time_ctx(event, &ctx->time);
- now += READ_ONCE(ctx->time.offset);
- return now;
+ return __perf_event_time_ctx_now(event, &ctx->time, now);
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2425,20 +2505,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
}
static inline void
-__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx,
+ bool final, enum event_type_t event_type)
{
if (ctx->is_active & EVENT_TIME) {
if (ctx->is_active & EVENT_FROZEN)
return;
+
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, final);
+ /* vPMU should not stop time */
+ update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final);
}
}
static inline void
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
- __ctx_time_update(cpuctx, ctx, false);
+ __ctx_time_update(cpuctx, ctx, false, 0);
}
/*
@@ -3510,7 +3593,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
*
* would only update time for the pinned events.
*/
- __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type);
/*
* CPU-release for the below ->is_active store,
@@ -3536,7 +3619,18 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule out all exclude_guest events of PMUs
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = EVENT_ALL;
+ __update_context_guest_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx, true);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
for_each_epc(pmu_ctx, ctx, pmu, event_type)
__pmu_ctx_sched_out(pmu_ctx, is_active);
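
The subtle part of this hunk is the is_active selection: on an ordinary schedule-out only the bits that actually changed are handed to __pmu_ctx_sched_out(), but on a guest transition every pinned and flexible exclude_guest event on a mediated PMU must go while context time keeps running. A simplified stand-alone model of just that selection, with assumed flag values:

/* Simplified model of which event classes ctx_sched_out() passes down. */
#include <stdio.h>

#define EVENT_FLEXIBLE 0x01	/* assumed */
#define EVENT_PINNED   0x02	/* assumed */
#define EVENT_GUEST    0x40	/* filter flag from this patch */
#define EVENT_ALL      (EVENT_FLEXIBLE | EVENT_PINNED)

static unsigned int events_to_switch_out(unsigned int changed_bits,
					 unsigned int event_type)
{
	if (event_type & EVENT_GUEST)
		return EVENT_ALL;	/* is_active = EVENT_ALL in the hunk above */
	return changed_bits;		/* is_active ^= ctx->is_active */
}

int main(void)
{
	/* Normal switch-out where only the flexible events changed state. */
	printf("task switch: %#x\n",
	       events_to_switch_out(EVENT_FLEXIBLE, EVENT_FLEXIBLE));
	/* Guest entry: everything host-only goes, regardless of changed bits. */
	printf("guest entry: %#x\n",
	       events_to_switch_out(0, EVENT_ALL | EVENT_GUEST));
	return 0;
}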
@@ -3995,10 +4089,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
event_update_userpage(event);
}
+struct merge_sched_data {
+ int can_add_hw;
+ enum event_type_t event_type;
+};
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- int *can_add_hw = data;
+ struct merge_sched_data *msd = data;
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -4006,13 +4105,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, *can_add_hw)) {
+ /*
+ * Don't schedule in any host events from a PMU with
+ * PERF_PMU_CAP_MEDIATED_VPMU while a guest is running.
+ */
+ if (is_guest_mediated_pmu_loaded() &&
+ event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU &&
+ !(msd->event_type & EVENT_GUEST))
+ return 0;
+
+ if (group_can_go_on(event, msd->can_add_hw)) {
if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
- *can_add_hw = 0;
+ msd->can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
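
A stand-alone model of the new early return in merge_sched_in(): while a guest's mediated vPMU is loaded, host events on MEDIATED_VPMU-capable PMUs are held back, whereas events on other PMUs, and the EVENT_GUEST passes themselves, go through unchanged. The capability bit value below is an assumption for illustration.

/* Model of the host-event filter added to merge_sched_in(). */
#include <stdbool.h>
#include <stdio.h>

#define EVENT_GUEST                0x40
#define PERF_PMU_CAP_MEDIATED_VPMU 0x0400	/* assumed value */

static bool reject_host_event(bool guest_pmu_loaded,
			      unsigned int pmu_capabilities,
			      unsigned int event_type)
{
	return guest_pmu_loaded &&
	       (pmu_capabilities & PERF_PMU_CAP_MEDIATED_VPMU) &&
	       !(event_type & EVENT_GUEST);
}

int main(void)
{
	/* Guest PMU loaded, core PMU mediated: a host event is held back. */
	printf("%d\n", reject_host_event(true, PERF_PMU_CAP_MEDIATED_VPMU, 0));  /* 1 */
	/* Same PMU during a guest-transition pass (EVENT_GUEST set): proceeds. */
	printf("%d\n", reject_host_event(true, PERF_PMU_CAP_MEDIATED_VPMU,
					 EVENT_GUEST));                          /* 0 */
	/* A PMU without the capability is unaffected by the guest. */
	printf("%d\n", reject_host_event(true, 0, 0));                           /* 0 */
	return 0;
}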
@@ -4035,11 +4143,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
static void pmu_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- int can_add_hw = 1;
+ struct merge_sched_data msd = {
+ .can_add_hw = 1,
+ .event_type = event_type,
+ };
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
+ merge_sched_in, &msd);
}
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
@@ -4048,9 +4160,9 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
struct perf_event_context *ctx = pmu_ctx->ctx;
if (event_type & EVENT_PINNED)
- pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type);
if (event_type & EVENT_FLEXIBLE)
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type);
}
static void
@@ -4067,9 +4179,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
return;
if (!(is_active & EVENT_TIME)) {
+ /* EVENT_TIME should be active while the guest runs */
+ WARN_ON_ONCE(event_type & EVENT_GUEST);
/* start ctx time */
__update_context_time(ctx, false);
- perf_cgroup_set_timestamp(cpuctx);
+ perf_cgroup_set_timestamp(cpuctx, false);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
@@ -4085,7 +4199,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule in the required exclude_guest events of PMUs
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = event_type & EVENT_ALL;
+
+ /*
+ * Update ctx time to set the new start time for
+ * the exclude_guest events.
+ */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
/*
* First go through the list and put on any pinned groups
@@ -4093,13 +4223,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
*/
if (is_active & EVENT_PINNED) {
for_each_epc(pmu_ctx, ctx, pmu, event_type)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST));
}
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE) {
for_each_epc(pmu_ctx, ctx, pmu, event_type)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST));
}
}
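
Taken together with the EVENT_GUEST branch in ctx_sched_out() above, the effect on an exclude_guest event can be simulated with two clocks: total time never stops, guest time advances only while the guest's mediated vPMU is loaded, and the event reports the difference. A user-space simulation of one entry/exit pair, not the kernel's code path:

/* Two-clock simulation across one guest entry/exit. */
#include <stdio.h>

typedef unsigned long long u64;

struct clock2 { u64 total, guest; };

static void advance(struct clock2 *c, u64 delta_ns, int in_guest)
{
	c->total += delta_ns;	/* "vPMU should not stop time" */
	if (in_guest)
		c->guest += delta_ns;
}

static u64 exclude_guest_time(const struct clock2 *c)
{
	return c->total - c->guest;
}

int main(void)
{
	struct clock2 c = { 0, 0 };

	advance(&c, 400, 0);	/* host runs 400 ns, event counts       */
	advance(&c, 250, 1);	/* guest runs 250 ns, event switched out */
	advance(&c, 100, 0);	/* back on the host for 100 ns           */

	/*
	 * total = 750, guest = 250: the exclude_guest event was enabled for
	 * 500 ns, exactly the time spent outside the guest.
	 */
	printf("enabled for exclude_guest: %llu ns\n", exclude_guest_time(&c));
	return 0;
}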
@@ -6627,22 +6757,22 @@ void perf_event_update_userpage(struct perf_event *event)
goto unlock;
/*
- * compute total_time_enabled, total_time_running
- * based on snapshot values taken when the event
- * was last scheduled in.
+ * Disable preemption to guarantee consistent time stamps are stored to
+ * the user page.
+ */
+ preempt_disable();
+
+ /*
+ * Compute total_time_enabled, total_time_running based on snapshot
+ * values taken when the event was last scheduled in.
*
- * we cannot simply called update_context_time()
- * because of locking issue as we can be called in
- * NMI context
+ * We cannot simply call update_context_time() because doing so would
+ * lead to deadlock when called from NMI context.
*/
calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page;
- /*
- * Disable preemption to guarantee consistent time stamps are stored to
- * the user page.
- */
- preempt_disable();
+
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
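
For context on why the snapshot and its publication must be consistent: user space reads these fields through the seqlock-style protocol on struct perf_event_mmap_page, retrying whenever the writer's lock counter changes. Below is a minimal reader-side sketch following the pattern documented in the perf_event UAPI header; it reads only the stored snapshots and omits the perf_event_open()/mmap() setup and the rdpmc/TSC extrapolation, so it is a fragment to compile against a mapped event page, not a complete tool.

/* Reader-side retry loop for the perf mmap page (sketch only). */
#include <linux/perf_event.h>
#include <stdint.h>

struct time_snapshot {
	uint64_t enabled;
	uint64_t running;
};

static struct time_snapshot
read_times(volatile struct perf_event_mmap_page *pc)
{
	struct time_snapshot s;
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();		/* pairs with the writer's barrier() */

		s.enabled = pc->time_enabled;
		s.running = pc->time_running;

		__sync_synchronize();
	} while (pc->lock != seq || (seq & 1));	/* odd: writer in progress */

	return s;
}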
@@ -7939,13 +8069,11 @@ static void perf_output_read(struct perf_output_handle *handle,
u64 read_format = event->attr.read_format;
/*
- * compute total_time_enabled, total_time_running
- * based on snapshot values taken when the event
- * was last scheduled in.
+ * Compute total_time_enabled, total_time_running based on snapshot
+ * values taken when the event was last scheduled in.
*
- * we cannot simply called update_context_time()
- * because of locking issue as we are called in
- * NMI context
+ * We cannot simply call update_context_time() because doing so would
+ * lead to deadlock when called from NMI context.
*/
if (read_format & PERF_FORMAT_TOTAL_TIMES)
calc_timer_values(event, &now, &enabled, &running);