From eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:27 -0700 Subject: psi: pressure stall information for CPU, memory, and IO When systems are overcommitted and resources become contended, it's hard to tell exactly the impact this has on workload productivity, or how close the system is to lockups and OOM kills. In particular, when machines work multiple jobs concurrently, the impact of overcommit in terms of latency and throughput on the individual job can be enormous. In order to maximize hardware utilization without sacrificing individual job health or risk complete machine lockups, this patch implements a way to quantify resource pressure in the system. A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that expose the percentage of time the system is stalled on CPU, memory, or IO, respectively. Stall states are aggregate versions of the per-task delay accounting delays: cpu: some tasks are runnable but not executing on a CPU memory: tasks are reclaiming, or waiting for swapin or thrashing cache io: tasks are waiting for io completions These percentages of walltime can be thought of as pressure percentages, and they give a general sense of system health and productivity loss incurred by resource overcommit. They can also indicate when the system is approaching lockup scenarios and OOMs. To do this, psi keeps track of the task states associated with each CPU and samples the time they spend in stall states. Every 2 seconds, the samples are averaged across CPUs - weighted by the CPUs' non-idle time to eliminate artifacts from unused CPUs - and translated into percentages of walltime. A running average of those percentages is maintained over 10s, 1m, and 5m periods (similar to the loadaverage). [hannes@cmpxchg.org: doc fixlet, per Randy] Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org [hannes@cmpxchg.org: code optimization] Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org [hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter] Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org [hannes@cmpxchg.org: fix build] Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 include/linux/psi.h (limited to 'include/linux/psi.h') diff --git a/include/linux/psi.h b/include/linux/psi.h new file mode 100644 index 000000000000..b0daf050de58 --- /dev/null +++ b/include/linux/psi.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_PSI_H +#define _LINUX_PSI_H + +#include +#include + +#ifdef CONFIG_PSI + +extern bool psi_disabled; + +void psi_init(void); + +void psi_task_change(struct task_struct *task, int clear, int set); + +void psi_memstall_tick(struct task_struct *task, int cpu); +void psi_memstall_enter(unsigned long *flags); +void psi_memstall_leave(unsigned long *flags); + +#else /* CONFIG_PSI */ + +static inline void psi_init(void) {} + +static inline void psi_memstall_enter(unsigned long *flags) {} +static inline void psi_memstall_leave(unsigned long *flags) {} + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_H */ -- cgit v1.2.3 From 2ce7135adc9ad081aa3c49744144376ac74fea60 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:31 -0700 Subject: psi: cgroup support On a system that executes multiple cgrouped jobs and independent workloads, we don't just care about the health of the overall system, but also that of individual jobs, so that we can ensure individual job health, fairness between jobs, or prioritize some jobs over others. This patch implements pressure stall tracking for cgroups. In kernels with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure, and io.pressure files that track aggregate pressure stall times for only the tasks inside the cgroup. Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux/psi.h') diff --git a/include/linux/psi.h b/include/linux/psi.h index b0daf050de58..8e0725aac0aa 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,9 @@ #include #include +struct seq_file; +struct css_set; + #ifdef CONFIG_PSI extern bool psi_disabled; @@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); +int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); +#endif + #else /* CONFIG_PSI */ static inline void psi_init(void) {} @@ -23,6 +34,20 @@ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} +#ifdef CONFIG_CGROUPS +static inline int psi_cgroup_alloc(struct cgroup *cgrp) +{ + return 0; +} +static inline void psi_cgroup_free(struct cgroup *cgrp) +{ +} +static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) +{ + rcu_assign_pointer(p->cgroups, to); +} +#endif + #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */ -- cgit v1.2.3