From d79ddb069c5257a924456eb99b53fc1ea715c0a3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:05 +0800 Subject: sched/psi: Move private helpers to sched/stats.h This patch moves the psi_task_change/psi_task_switch declarations out of the PSI public header, since they are only needed for implementing the PSI stats tracking in sched/stats.h. psi_task_switch is obvious; psi_task_change can't be a public helper since it doesn't check the psi_disabled static key. And there are no other users of it now, so put it in sched/stats.h too. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-5-zhouchengming@bytedance.com --- include/linux/psi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index dd74411ac21d..fffd229fbf19 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -18,10 +18,6 @@ extern struct psi_group psi_system; void psi_init(void); -void psi_task_change(struct task_struct *task, int clear, int set); -void psi_task_switch(struct task_struct *prev, struct task_struct *next, - bool sleep); - void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); -- cgit v1.2.3 From 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Aug 2022 00:41:07 +0800 Subject: sched/psi: Remove NR_ONCPU task accounting We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance. Since we want to add another state, PSI_IRQ_FULL, to track IRQ/SOFTIRQ pressure, we need to reclaim space first. This patch removes the NR_ONCPU task accounting in struct psi_group_cpu and uses one bit in state_mask to track it instead. 
Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com --- include/linux/psi_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index c7fe7c089718..54cb74946db4 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -15,13 +15,6 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - /* - * This can't have values other than 0 or 1 and could be - * implemented as a bit flag. But for now we still have room - * in the first cacheline of psi_group_cpu, and this way we - * don't have to special case any state tracking for it. - */ - NR_ONCPU, /* * For IO and CPU stalls the presence of running/oncpu tasks * in the domain means a partial rather than a full stall. @@ -32,16 +25,18 @@ enum psi_task_count { * threads and memstall ones. 
*/ NR_MEMSTALL_RUNNING, - NR_PSI_TASK_COUNTS = 5, + NR_PSI_TASK_COUNTS = 4, }; /* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) -#define TSK_ONCPU (1 << NR_ONCPU) #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) +/* Only one task can be scheduled, no corresponding task count */ +#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) + /* Resources that workloads could be stalled on */ enum psi_res { PSI_IO, @@ -68,6 +63,9 @@ enum psi_states { NR_PSI_STATES = 7, }; +/* Use one bit in the state mask to track TSK_ONCPU */ +#define PSI_ONCPU (1 << NR_PSI_STATES) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, -- cgit v1.2.3 From 52b1364ba0b105122d6de0e719b36db705011ac1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:08 +0800 Subject: sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure Now PSI already tracked workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have obvious impact on some workload productivity, such as web service workload. When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time from update_rq_clock_task(), in which we can record that delta to CPU curr task's cgroups as PSI_IRQ_FULL status. Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in the current task on the CPU, make nothing productive could run even if it were runnable, so we only use PSI_IRQ_FULL. 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com --- include/linux/psi_types.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 54cb74946db4..40c28171cd91 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -42,7 +42,10 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES = 3, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, }; /* @@ -58,9 +61,12 @@ enum psi_states { PSI_MEM_FULL, PSI_CPU_SOME, PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 7, + NR_PSI_STATES, }; /* Use one bit in the state mask to track TSK_ONCPU */ -- cgit v1.2.3 From 57899a6610e67ba26fa3251ebbef4a5ed21efc5d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:09 +0800 Subject: sched/psi: Consolidate cgroup_psi() cgroup_psi() can't return the psi_group for the root cgroup, so we have many open-coded instances of "psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi". This patch moves the cgroup_psi() definition to <linux/psi.h>, in which we can return psi_system for the root cgroup, so it can handle all cgroups. 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-9-zhouchengming@bytedance.com --- include/linux/cgroup.h | 5 ----- include/linux/psi.h | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0914aa26506..80cb970257be 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -673,11 +673,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return cgrp->psi; -} - bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) diff --git a/include/linux/psi.h b/include/linux/psi.h index fffd229fbf19..362a74ca1d3b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -7,6 +7,7 @@ #include #include #include +#include struct seq_file; struct css_set; @@ -30,6 +31,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; +} + int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); -- cgit v1.2.3 From dc86aba751e2867244411adda1562f6664747019 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:10 +0800 Subject: sched/psi: Cache parent psi_group to speed up group iteration We use iterate_groups() to iterate each level psi_group to update PSI stats, which is a very hot path. In current code, iterate_groups() have to use multiple branches and cgroup_parent() to get parent psi_group for each level, which is not very efficient. 
This patch caches the parent psi_group in struct psi_group, so we only need to get the psi_group of the task itself first, and then just use group->parent to iterate. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-10-zhouchengming@bytedance.com --- include/linux/psi_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 40c28171cd91..a0b746258c68 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -151,6 +151,8 @@ struct psi_trigger { }; struct psi_group { + struct psi_group *parent; + /* Protects data used by the aggregator */ struct mutex avgs_lock; -- cgit v1.2.3 From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates them at each level of the hierarchy. This may cause non-negligible overhead for some workloads that run deep in the hierarchy. Commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") made PSI skip per-cgroup stall accounting and only account system-wide, to avoid this per-level overhead. But for our use case, we also want leaf cgroup PSI stats accounted for userspace adjustment on that cgroup, apart from only system-wide adjustment. So this patch introduces a per-cgroup PSI accounting disable/re-enable interface "cgroup.pressure", which is a read-write single-value file whose allowed values are "0" and "1"; the default is "1", so per-cgroup PSI stats are enabled by default. Implementation details: It should be relatively straight-forward to disable and re-enable state aggregation, time tracking, averaging on a per-cgroup level, if we can live with losing history from while it was disabled. I.e. 
the avgs will restart from 0, total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and PSI_ONCPU bit in psi_group_change() even when the cgroup PSI stats is disabled. Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- include/linux/cgroup-defs.h | 3 +++ include/linux/psi.h | 2 ++ include/linux/psi_types.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..7df76b318245 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -428,6 +428,9 @@ struct cgroup { struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ + /* handles for "{cpu,memory,io,irq}.pressure" */ + struct cgroup_file psi_files[NR_PSI_RESOURCES]; + /* * The bitmask of subsystems enabled on the child cgroups. 
* ->subtree_control is the one configured through diff --git a/include/linux/psi.h b/include/linux/psi.h index 362a74ca1d3b..b029a847def1 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ @@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } +static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index a0b746258c68..6e4372735068 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -152,6 +152,7 @@ struct psi_trigger { struct psi_group { struct psi_group *parent; + bool enabled; /* Protects data used by the aggregator */ struct mutex avgs_lock; @@ -194,6 +195,8 @@ struct psi_group { #else /* CONFIG_PSI */ +#define NR_PSI_RESOURCES 0 + struct psi_group { }; #endif /* CONFIG_PSI */ -- cgit v1.2.3