From f0c867d9588d9efc10d6a55009c9560336673369 Mon Sep 17 00:00:00 2001
From: yuzhoujian
Date: Fri, 28 Dec 2018 00:36:10 -0800
Subject: mm, oom: add oom victim's memcg to the oom context information

The current oom report doesn't display the victim's memcg context during
the global OOM situation.  While this information is not strictly needed,
it can be really helpful for containerized environments to locate which
container has lost a process.  Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg,
which is the victim's memcg.

Below is the single line output in the oom report after this patch.

- global oom context information:

oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>

- memcg oom context information:

oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>

[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
  Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian
Signed-off-by: Tetsuo Handa
Acked-by: Michal Hocko
Cc: David Rientjes
Cc: "Kirill A . Shutemov"
Cc: Andrea Arcangeli
Cc: Tetsuo Handa
Cc: Roman Gushchin
Cc: Yang Shi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7ab2120155a4..83ae11cbd12c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -526,9 +526,11 @@ void mem_cgroup_handle_over_high(void);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);
+
 static inline void mem_cgroup_enter_user_fault(void)
 {
 	WARN_ON(current->in_user_fault);
@@ -970,7 +972,12 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 }
 
 static inline void
-mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
+{
+}
+
+static inline void
+mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
 }
 
--
cgit v1.2.3
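A minimal caller sketch may help make the shape of the split concrete: the
context helper is meant to be stitched into the one-line report, while the
meminfo helper prints the verbose statistics separately.  The
dump_oom_summary() name and the oom_control fields below are assumptions
for illustration, not something this patch adds:

/*
 * Illustrative sketch only, not part of this patch: one plausible way a
 * caller in mm/oom_kill.c could assemble the single-line context.  The
 * memcg helper fills in the oom_memcg/task_memcg fields via pr_cont() so
 * everything stays on one line for log parsers.
 */
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
{
        pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
                oom_constraint_text[oc->constraint],
                nodemask_pr_args(oc->nodemask));
        cpuset_print_current_mems_allowed();
        mem_cgroup_print_oom_context(oc->memcg, victim);
        pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
                from_kuid(&init_user_ns, task_uid(victim)));
}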
From 60cd4bcd62384cfa1e5890cebacccf08b3161156 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Tue, 5 Mar 2019 15:43:13 -0800
Subject: memcg: localize memcg_kmem_enabled() check

Move the memcg_kmem_enabled() checks into the memcg kmem charge/uncharge
functions, so users don't have to check that condition explicitly.

This is purely a code cleanup patch without any functional change.  Only
the order of checks in memcg_charge_slab() can potentially change, but
functionally it will be the same.  This should not matter, as
memcg_charge_slab() is not in the hot path.

Link: http://lkml.kernel.org/r/20190103161203.162375-1-shakeelb@google.com
Signed-off-by: Shakeel Butt
Acked-by: Michal Hocko
Cc: Johannes Weiner
Cc: Vladimir Davydov
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83ae11cbd12c..b0eb29ea0d9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1273,12 +1273,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 
 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
 void memcg_kmem_put_cache(struct kmem_cache *cachep);
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
-			    struct mem_cgroup *memcg);
 
 #ifdef CONFIG_MEMCG_KMEM
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
-void memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+			      struct mem_cgroup *memcg);
 
 extern struct static_key_false memcg_kmem_enabled_key;
 extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1300,6 +1300,26 @@ static inline bool memcg_kmem_enabled(void)
 	return static_branch_unlikely(&memcg_kmem_enabled_key);
 }
 
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+	if (memcg_kmem_enabled())
+		return __memcg_kmem_charge(page, gfp, order);
+	return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_uncharge(page, order);
+}
+
+static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
+					  int order, struct mem_cgroup *memcg)
+{
+	if (memcg_kmem_enabled())
+		return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+	return 0;
+}
+
 /*
  * helper for accessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -1325,6 +1345,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order)
 {
 }
 
+static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+	return 0;
+}
+
+static inline void __memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
 
--
cgit v1.2.3
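What the cleanup buys at call sites is easiest to see side by side.  A
sketch, assuming a typical __GFP_ACCOUNT charge site (the surrounding
code is illustrative, not taken from this patch):

/*
 * Illustrative call-site sketch, not part of this patch.
 *
 * Before: every caller guarded the charge with the static branch itself:
 *
 *	if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) &&
 *	    memcg_kmem_charge(page, gfp, order))
 *		goto fail;
 *
 * After: the wrapper tests memcg_kmem_enabled_key internally and is a
 * no-op returning 0 when kmem accounting is disabled, so the same site
 * reduces to:
 */
if ((gfp & __GFP_ACCOUNT) && memcg_kmem_charge(page, gfp, order))
        goto fail;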
From aa9694bb78bf6eb03810108d5f6064fafa4ae1e1 Mon Sep 17 00:00:00 2001
From: Chris Down
Date: Tue, 5 Mar 2019 15:45:52 -0800
Subject: mm, memcg: create mem_cgroup_from_seq

This is the start of a series of patches similar to my earlier
DEFINE_MEMCG_MAX_OR_VAL work, but with less Macro Magic(tm).

There are a bunch of places we go from seq_file to mem_cgroup, which
currently requires manually getting the css, then getting the mem_cgroup
from the css.  It's in enough places now that having mem_cgroup_from_seq
makes sense (and also makes the next patch a bit nicer).

Link: http://lkml.kernel.org/r/20190124194050.GA31341@chrisdown.name
Signed-off-by: Chris Down
Acked-by: Johannes Weiner
Acked-by: Michal Hocko
Cc: Tejun Heo
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b0eb29ea0d9c..1f3d880b7ca1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 }
 struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
 
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+	return mem_cgroup_from_css(seq_css(m));
+}
+
 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
 {
 	struct mem_cgroup_per_node *mz;
@@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return NULL;
 }
 
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+	return NULL;
+}
+
 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
 {
 	return NULL;
--
cgit v1.2.3
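The win is one line per call site.  A sketch of a cgroup v2 show handler
under the new helper; the specific handler and field are assumptions for
illustration, not code from this patch:

/*
 * Illustrative sketch, not part of this patch: a cgroup v2 interface
 * file's show callback can now reach its memcg in one step instead of
 * writing mem_cgroup_from_css(seq_css(m)) at every such site.
 */
static int memory_oom_group_show(struct seq_file *m, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

        seq_printf(m, "%d\n", memcg->oom_group);
        return 0;
}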
From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Fri, 5 Apr 2019 18:39:18 -0700
Subject: mm: writeback: use exact memcg dirty counts

Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:

1) per-memcg per-cpu values in range of [-32..32]

2) per-memcg atomic counter

When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic.  Stat readers only check the atomic.  Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.

Assuming 100 cpus:
   4k x86 page_size:  13 MiB error per memcg
  64k ppc page_size: 200 MiB error per memcg

Considering that dirty+writeback are used together for some decisions,
the errors double.

This inaccuracy can lead to undeserved oom kills.  One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages.  If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages, thus resorting to oom
kill.

It could be argued that tiny containers are not supported, but it's more
subtle.  It's the amount of space available for the file lru that
matters.  If a container has memory.max-200MiB of non-reclaimable memory,
then it will also suffer such oom kills on a 100 cpu machine.

The following test reliably ooms without this patch.  This patch avoids
oom kills.

$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)

$ cat memcg-writeback-stress.c
/*
 * Dirty pages from all but one cpu.
 * Clean pages from the non dirtying cpu.
 * This is to stress per cpu counter imbalance.
 * On a 100 cpu machine:
 * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
 * - per memcg atomic is -99*32 pages
 * - thus the complete dirty limit: sum of all counters 0
 * - balance_dirty_pages() only sees atomic count -99*32 pages, which
 *   it max()s to 0.
 * - So a workload can dirty -99*32 pages before balance_dirty_pages()
 *   cares.
 */
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>

static char *buf;
static int bufSize;

static void set_affinity(int cpu)
{
        cpu_set_t affinity;

        CPU_ZERO(&affinity);
        CPU_SET(cpu, &affinity);
        if (sched_setaffinity(0, sizeof(affinity), &affinity))
                err(1, "sched_setaffinity");
}

static void dirty_on(int output_fd, int cpu)
{
        int i, wrote;

        set_affinity(cpu);
        for (i = 0; i < 32; i++) {
                for (wrote = 0; wrote < bufSize; ) {
                        int ret = write(output_fd, buf+wrote, bufSize-wrote);
                        if (ret == -1)
                                err(1, "write");
                        wrote += ret;
                }
        }
}

int main(int argc, char **argv)
{
        int cpu, flush_cpu = 1, output_fd;
        const char *output;

        if (argc != 2)
                errx(1, "usage: output_file");
        output = argv[1];
        bufSize = getpagesize();
        buf = malloc(getpagesize());
        if (buf == NULL)
                errx(1, "malloc failed");
        output_fd = open(output, O_CREAT|O_RDWR);
        if (output_fd == -1)
                err(1, "open(%s)", output);

        for (cpu = 0; cpu < get_nprocs(); cpu++) {
                if (cpu != flush_cpu)
                        dirty_on(output_fd, cpu);
        }

        set_affinity(flush_cpu);
        if (fsync(output_fd))
                err(1, "fsync(%s)", output);
        if (close(output_fd))
                err(1, "close(%s)", output);
        free(buf);
}

Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per-memcg counters.  This avoids the aforementioned oom
kills.

This does not affect the overhead of memory.stat, which still reads the
single atomic counter.

Why not use percpu_counter?  memcg already handles cpus going offline,
so no need for that overhead from percpu_counter.  And the
percpu_counter spinlocks are more heavyweight than is required.

It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports.  But that is saved for later.

Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen
Reviewed-by: Roman Gushchin
Acked-by: Johannes Weiner
Cc: Michal Hocko
Cc: Vladimir Davydov
Cc: Tejun Heo
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f3d880b7ca1..dbb6118370c1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -566,7 +566,10 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
 void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
--
cgit v1.2.3
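For reference, the exact reader that the new "Keep in sync with
memcg_exact_page_state()" comment points at lives in mm/memcontrol.c; a
close paraphrase of it (not the verbatim patch hunk) looks like this:

/*
 * Sketch of memcg_exact_page_state(), closely following the commit: fold
 * the unflushed [-32..32] per-cpu deltas into the atomic counter instead
 * of trusting the atomic alone, and clamp the result at zero.
 */
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
        long x = atomic_long_read(&memcg->stat[idx]);
        int cpu;

        for_each_online_cpu(cpu)
                x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
        if (x < 0)
                x = 0;
        return x;
}

This makes the changelog's tradeoff concrete: readers that tolerate the
error keep the cheap atomic read, while balance_dirty_pages() pays the
per-cpu walk for exactness.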