From 1f4c025b5a5520fd2571244196b1b01ad96d18f6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:21 -0700 Subject: memcg: export memory cgroup's swappiness with mem_cgroup_swappiness() Each memory cgroup has a 'swappiness' value which can be accessed by get_swappiness(memcg). The major user is try_to_free_mem_cgroup_pages() and swappiness is passed by argument. It's propagated by scan_control. get_swappiness() is a static function but some planned updates will need to get swappiness from files other than memcontrol.c. This patch exports get_swappiness() as mem_cgroup_swappiness(). With this, we can remove the argument of swappiness from try_to_free... and drop swappiness from scan_control. Only memcg uses it. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Michal Hocko Cc: Ying Han Cc: Shaohua Li Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d25..506d116a7d33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -249,7 +249,7 @@ struct mem_cgroup { atomic_t oom_lock; atomic_t refcnt; - unsigned int swappiness; + int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -1329,7 +1329,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) return margin >> PAGE_SHIFT; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +int mem_cgroup_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1776,12 +1776,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - &nr_scanned); + noswap, zone, &nr_scanned); *total_scanned += nr_scanned; } else ret = 
try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, get_swappiness(victim)); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3826,7 +3825,7 @@ try_to_free: goto out; } progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, get_swappiness(mem)); + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4288,7 +4287,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - return get_swappiness(memcg); + return mem_cgroup_swappiness(memcg); } static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, @@ -4997,7 +4996,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) INIT_LIST_HEAD(&mem->oom_notify); if (parent) - mem->swappiness = get_swappiness(parent); + mem->swappiness = mem_cgroup_swappiness(parent); atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); -- cgit v1.2.3 From bb2a0de92c891b8feeedc0178acb3ae009d899a8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:22 -0700 Subject: memcg: consolidate memory cgroup lru stat functions In mm/memcontrol.c, there are many lru stat functions as.. mem_cgroup_zone_nr_lru_pages mem_cgroup_node_nr_file_lru_pages mem_cgroup_nr_file_lru_pages mem_cgroup_node_nr_anon_lru_pages mem_cgroup_nr_anon_lru_pages mem_cgroup_node_nr_unevictable_lru_pages mem_cgroup_nr_unevictable_lru_pages mem_cgroup_node_nr_lru_pages mem_cgroup_nr_lru_pages mem_cgroup_get_local_zonestat Some of them are under #ifdef MAX_NUMNODES >1 and others are not. This seems bad. This patch consolidates all functions into mem_cgroup_zone_nr_lru_pages() mem_cgroup_node_nr_lru_pages() mem_cgroup_nr_lru_pages() For these functions, "which LRU?" information is passed by a mask. 
example: mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)) And I added some macros: LRU_ALL, LRU_ALL_FILE, LRU_ALL_ANON. example: mem_cgroup_nr_lru_pages(mem, LRU_ALL) BTW, considering layout of NUMA memory placement of counters, this patch seems to be better. Now, when we gather all LRU information, we scan in the following order for_each_lru -> for_each_node -> for_each_zone. This means we'll touch cache lines in different node in turn. After patch, we'll scan for_each_node -> for_each_zone -> for_each_lru(mask) Then, we'll gather information in the same cacheline at once. [akpm@linux-foundation.org: fix warnings, build error] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Michal Hocko Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 176 ++++++++++++++++---------------------------------------- 1 file changed, 49 insertions(+), 127 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 506d116a7d33..85599662bd90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -636,27 +636,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } -static unsigned long -mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) +unsigned long +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, + unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; + enum lru_list l; + unsigned long ret = 0; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + + for_each_lru(l) { + if (BIT(l) & lru_mask) + ret += MEM_CGROUP_ZSTAT(mz, l); + } + return ret; +} + +static unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, + int nid, unsigned int lru_mask) +{ u64 total = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + for (zid = 0; zid < MAX_NR_ZONES; zid++) + total += mem_cgroup_zone_nr_lru_pages(mem, 
nid, zid, lru_mask); + return total; } -static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, - enum lru_list idx) + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, + unsigned int lru_mask) { int nid; u64 total = 0; - for_each_online_node(nid) - total += mem_cgroup_get_zonestat_node(mem, nid, idx); + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); return total; } @@ -1077,8 +1094,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1117,109 +1134,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) unsigned long active; unsigned long inactive; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); return (active > inactive); } -unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return MEM_CGROUP_ZSTAT(mz, lru); -} - -static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); - - return ret; -} - -static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct 
mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); - return ret; -} - -#if MAX_NUMNODES > 1 -static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); - - return total; -} - -static unsigned long -mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) -{ - return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); -} - -static unsigned long -mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - enum lru_list l; - u64 total = 0; - - for_each_lru(l) - total += mem_cgroup_get_zonestat_node(memcg, nid, l); - - return total; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid); - - return total; -} -#endif /* CONFIG_NUMA */ - struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1576,11 +1496,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, int nid, bool noswap) { - if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) return true; if (noswap 
|| !total_swap_pages) return false; - if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) return true; return false; @@ -4151,15 +4071,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; } @@ -4181,35 +4101,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); - total_nr = mem_cgroup_nr_lru_pages(mem_cont); + total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); seq_printf(m, "total=%lu", total_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); + file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); seq_printf(m, "file=%lu", file_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); + node_nr = 
mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_FILE); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); + anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); seq_printf(m, "anon=%lu", anon_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_ANON); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); + unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); seq_printf(m, "unevictable=%lu", unevictable_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, - nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + BIT(LRU_UNEVICTABLE)); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); -- cgit v1.2.3 From 79dfdaccd1d5b40ff7cf4a35a0e63696ebb78b4d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:23 -0700 Subject: memcg: make oom_lock 0 and 1 based rather than counter Commit 867578cb ("memcg: fix oom kill behavior") introduced a oom_lock counter which is incremented by mem_cgroup_oom_lock when we are about to handle memcg OOM situation. mem_cgroup_handle_oom falls back to a sleep if oom_lock > 1 to prevent from multiple oom kills at the same time. The counter is then decremented by mem_cgroup_oom_unlock called from the same function. This works correctly but it can lead to serious starvations when we have many processes triggering OOM and many CPUs available for them (I have tested with 16 CPUs). Consider a process (call it A) which gets the oom_lock (the first one that got to mem_cgroup_handle_oom and grabbed memcg_oom_mutex) and other processes that are blocked on the mutex. 
While A releases the mutex and calls mem_cgroup_out_of_memory others will wake up (one after another) and increase the counter and fall into sleep (memcg_oom_waitq). Once A finishes mem_cgroup_out_of_memory it takes the mutex again and decreases oom_lock and wakes other tasks (if releasing memory by somebody else - e.g. killed process - hasn't done it yet). A testcase would look like: Assume malloc XXX is a program allocating XXX Megabytes of memory which touches all allocated pages in a tight loop # swapoff SWAP_DEVICE # cgcreate -g memory:A # cgset -r memory.oom_control=0 A # cgset -r memory.limit_in_bytes= 200M # for i in `seq 100` # do # cgexec -g memory:A malloc 10 & # done The main problem here is that all processes still race for the mutex and there is no guarantee that we will get counter back to 0 for those that got back to mem_cgroup_handle_oom. In the end the whole convoy in/decreases the counter but we do not get to 1 that would enable killing so nothing useful can be done. The time is basically unbounded because it highly depends on scheduling and ordering on mutex (I have seen this taking hours...). This patch replaces the counter by a simple {un}lock semantic. As mem_cgroup_oom_{un}lock works on the a subtree of a hierarchy we have to make sure that nobody else races with us which is guaranteed by the memcg_oom_mutex. We have to be careful while locking subtrees because we can encounter a subtree which is already locked: hierarchy: A / \ B \ /\ \ C D E B - C - D tree might be already locked. While we want to enable locking E subtree because OOM situations cannot influence each other we definitely do not want to allow locking A. Therefore we have to refuse lock if any subtree is already locked and clear up the lock for all nodes that have been set up to the failure point. On the other hand we have to make sure that the rest of the world will recognize that a group is under OOM even though it doesn't have a lock. 
Therefore we have to introduce under_oom variable which is incremented and decremented for the whole subtree when we enter resp. leave mem_cgroup_handle_oom. under_oom, unlike oom_lock, doesn't need be updated under memcg_oom_mutex because its users only check a single group and they use atomic operations for that. This can be checked easily by the following test case: # cgcreate -g memory:A # cgset -r memory.use_hierarchy=1 A # cgset -r memory.oom_control=1 A # cgset -r memory.limit_in_bytes= 100M # cgset -r memory.memsw.limit_in_bytes= 100M # cgcreate -g memory:A/B # cgset -r memory.oom_control=1 A/B # cgset -r memory.limit_in_bytes=20M # cgset -r memory.memsw.limit_in_bytes=20M # cgexec -g memory:A/B malloc 30 & #->this will be blocked by OOM of group B # cgexec -g memory:A malloc 80 & #->this will be blocked by OOM of group A While B gets oom_lock A will not get it. Both of them go into sleep and wait for an external action. We can make the limit higher for A to enforce waking it up # cgset -r memory.memsw.limit_in_bytes=300M A # cgset -r memory.limit_in_bytes=300M A malloc in A has to wake up even though it doesn't have oom_lock. Finally, the unlock path is very easy because we always unlock only the subtree we have locked previously while we always decrement under_oom. Signed-off-by: Michal Hocko Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 16 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85599662bd90..95d6c256b54c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -246,7 +246,10 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? 
*/ bool use_hierarchy; - atomic_t oom_lock; + + bool oom_lock; + atomic_t under_oom; + atomic_t refcnt; int swappiness; @@ -1722,37 +1725,83 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. + * Has to be called with memcg_oom_mutex */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int x, lock_count = 0; - struct mem_cgroup *iter; + int lock_count = -1; + struct mem_cgroup *iter, *failed = NULL; + bool cond = true; - for_each_mem_cgroup_tree(iter, mem) { - x = atomic_inc_return(&iter->oom_lock); - lock_count = max(x, lock_count); + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + bool locked = iter->oom_lock; + + iter->oom_lock = true; + if (lock_count == -1) + lock_count = iter->oom_lock; + else if (lock_count != locked) { + /* + * this subtree of our hierarchy is already locked + * so we cannot give a lock. + */ + lock_count = 0; + failed = iter; + cond = false; + } } - if (lock_count == 1) - return true; - return false; + if (!failed) + goto done; + + /* + * OK, we failed to lock the whole subtree so we have to clean up + * what we set up to the failing subtree + */ + cond = true; + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + if (iter == failed) { + cond = false; + continue; + } + iter->oom_lock = false; + } +done: + return lock_count; } +/* + * Has to be called with memcg_oom_mutex + */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { struct mem_cgroup *iter; + for_each_mem_cgroup_tree(iter, mem) + iter->oom_lock = false; + return 0; +} + +static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + atomic_inc(&iter->under_oom); +} + +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be 
called. We have to use * atomic_add_unless() here. */ for_each_mem_cgroup_tree(iter, mem) - atomic_add_unless(&iter->oom_lock, -1, 0); - return 0; + atomic_add_unless(&iter->under_oom, -1, 0); } - static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1794,7 +1843,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) static void memcg_oom_recover(struct mem_cgroup *mem) { - if (mem && atomic_read(&mem->oom_lock)) + if (mem && atomic_read(&mem->under_oom)) memcg_wakeup_oom(mem); } @@ -1812,6 +1861,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; + mem_cgroup_mark_under_oom(mem); + /* At first, try to OOM lock hierarchy under mem.*/ mutex_lock(&memcg_oom_mutex); locked = mem_cgroup_oom_lock(mem); @@ -1835,10 +1886,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) finish_wait(&memcg_oom_waitq, &owait.wait); } mutex_lock(&memcg_oom_mutex); - mem_cgroup_oom_unlock(mem); + if (locked) + mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); mutex_unlock(&memcg_oom_mutex); + mem_cgroup_unmark_under_oom(mem); + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; /* Give chance to dying process */ @@ -4505,7 +4559,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? 
*/ - if (atomic_read(&memcg->oom_lock)) + if (atomic_read(&memcg->under_oom)) eventfd_signal(eventfd, 1); mutex_unlock(&memcg_oom_mutex); @@ -4540,7 +4594,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); - if (atomic_read(&mem->oom_lock)) + if (atomic_read(&mem->under_oom)) cb->fill(cb, "under_oom", 1); else cb->fill(cb, "under_oom", 0); -- cgit v1.2.3 From 1af8efe965676ab30d6c8a5b1fccc9229f339a3b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:24 -0700 Subject: memcg: change memcg_oom_mutex to spinlock memcg_oom_mutex is used to protect memcg OOM path and eventfd interface for oom_control. None of the critical sections which it protects sleep (eventfd_signal works from atomic context and the rest are simple linked list resp. oom_lock atomic operations). Mutex is also too heavyweight for those code paths because it triggers a lot of scheduling. It also makes convoying effects more visible when we have a big number of oom killing because we take the lock multiple times during mem_cgroup_handle_oom so we have multiple places where many processes can sleep. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 95d6c256b54c..c0b065ec1571 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1725,7 +1725,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. 
- * Has to be called with memcg_oom_mutex + * Has to be called with memcg_oom_lock */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { @@ -1770,7 +1770,7 @@ done: } /* - * Has to be called with memcg_oom_mutex + * Has to be called with memcg_oom_lock */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { @@ -1802,7 +1802,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) atomic_add_unless(&iter->under_oom, -1, 0); } -static DEFINE_MUTEX(memcg_oom_mutex); +static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -1864,7 +1864,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) mem_cgroup_mark_under_oom(mem); /* At first, try to OOM lock hierarchy under mem.*/ - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); locked = mem_cgroup_oom_lock(mem); /* * Even if signal_pending(), we can't quit charge() loop without @@ -1876,7 +1876,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) need_to_kill = false; if (locked) mem_cgroup_oom_notify(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1885,11 +1885,11 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); if (locked) mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); mem_cgroup_unmark_under_oom(mem); @@ -4553,7 +4553,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, if (!event) return -ENOMEM; - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); event->eventfd = eventfd; list_add(&event->list, &memcg->oom_notify); @@ -4561,7 +4561,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, /* already in OOM ? 
*/ if (atomic_read(&memcg->under_oom)) eventfd_signal(eventfd, 1); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); return 0; } @@ -4575,7 +4575,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, BUG_ON(type != _OOM_TYPE); - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { if (ev->eventfd == eventfd) { @@ -4584,7 +4584,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, } } - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); } static int mem_cgroup_oom_control_read(struct cgroup *cgrp, -- cgit v1.2.3 From 108b6a78463bb8c7163e4f9779f36ad8bbade334 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 26 Jul 2011 16:08:25 -0700 Subject: memcg: fix behavior of mem_cgroup_resize_limit() Commit 22a668d7c3ef ("memcg: fix behavior under memory.limit equals to memsw.limit") introduced "memsw_is_minimum" flag, which becomes true when mem_limit == memsw_limit. The flag is checked at the beginning of reclaim, and "noswap" is set if the flag is true, because using swap is meaningless in this case. This works well in most cases, but when we try to shrink mem_limit, which is the same as memsw_limit now, we might fail to shrink mem_limit because swap isn't used. This patch fixes this behavior by: - check MEM_CGROUP_RECLAIM_SHRINK at the beginning of reclaim - If it is set, don't set "noswap" flag even if memsw_is_minimum is true. 
Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Ying Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0b065ec1571..dfeca594fd7a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1653,7 +1653,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; while (1) { -- cgit v1.2.3 From 82f9d486e59f588c7d100865c36510644abda356 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:26 -0700 Subject: memcg: add memory.vmscan_stat The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim in...") says it adds scanning stats to memory.stat file. But it doesn't because we considered we needed to reach a consensus for such new APIs. This patch is a trial to add memory.vmscan_stat. This shows - the number of scanned pages(total, anon, file) - the number of rotated pages(total, anon, file) - the number of freed pages(total, anon, file) - the elapsed time (including sleep/pause time) for both of direct/soft reclaim. The biggest difference with Ying's original one is that this file can be reset by some write, as # echo 0 ...../memory.vmscan_stat Example of output is here. This is a result after make -j 6 kernel under 300M limit. 
[kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.scan_stat [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat scanned_pages_by_limit 9471864 scanned_anon_pages_by_limit 6640629 scanned_file_pages_by_limit 2831235 rotated_pages_by_limit 4243974 rotated_anon_pages_by_limit 3971968 rotated_file_pages_by_limit 272006 freed_pages_by_limit 2318492 freed_anon_pages_by_limit 962052 freed_file_pages_by_limit 1356440 elapsed_ns_by_limit 351386416101 scanned_pages_by_system 0 scanned_anon_pages_by_system 0 scanned_file_pages_by_system 0 rotated_pages_by_system 0 rotated_anon_pages_by_system 0 rotated_file_pages_by_system 0 freed_pages_by_system 0 freed_anon_pages_by_system 0 freed_file_pages_by_system 0 elapsed_ns_by_system 0 scanned_pages_by_limit_under_hierarchy 9471864 scanned_anon_pages_by_limit_under_hierarchy 6640629 scanned_file_pages_by_limit_under_hierarchy 2831235 rotated_pages_by_limit_under_hierarchy 4243974 rotated_anon_pages_by_limit_under_hierarchy 3971968 rotated_file_pages_by_limit_under_hierarchy 272006 freed_pages_by_limit_under_hierarchy 2318492 freed_anon_pages_by_limit_under_hierarchy 962052 freed_file_pages_by_limit_under_hierarchy 1356440 elapsed_ns_by_limit_under_hierarchy 351386416101 scanned_pages_by_system_under_hierarchy 0 scanned_anon_pages_by_system_under_hierarchy 0 scanned_file_pages_by_system_under_hierarchy 0 rotated_pages_by_system_under_hierarchy 0 rotated_anon_pages_by_system_under_hierarchy 0 rotated_file_pages_by_system_under_hierarchy 0 freed_pages_by_system_under_hierarchy 0 freed_anon_pages_by_system_under_hierarchy 0 freed_file_pages_by_system_under_hierarchy 0 elapsed_ns_by_system_under_hierarchy 0 xxxx_under_hierarchy is for hierarchy management. This will be useful for further memcg developments and needs to be developed before we do some complicated rework on LRU/softlimit management. This patch adds a new struct memcg_scanrecord into scan_control struct. sc->nr_scanned et al. are not designed for exporting information. 
For example, nr_scanned is reset frequentrly and incremented +2 at scanning mapped pages. To avoid complexity, I added a new param in scan_control which is for exporting scanning score. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Andrew Bresticker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 166 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfeca594fd7a..04e505bfd7dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +enum { + SCAN_BY_LIMIT, + SCAN_BY_SYSTEM, + NR_SCAN_CONTEXT, + SCAN_BY_SHRINK, /* not recorded now */ +}; + +enum { + SCAN, + SCAN_ANON, + SCAN_FILE, + ROTATE, + ROTATE_ANON, + ROTATE_FILE, + FREED, + FREED_ANON, + FREED_FILE, + ELAPSED, + NR_SCANSTATS, +}; + +struct scanstat { + spinlock_t lock; + unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; + unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; +}; + +const char *scanstat_string[NR_SCANSTATS] = { + "scanned_pages", + "scanned_anon_pages", + "scanned_file_pages", + "rotated_pages", + "rotated_anon_pages", + "rotated_file_pages", + "freed_pages", + "freed_anon_pages", + "freed_file_pages", + "elapsed_ns", +}; +#define SCANSTAT_WORD_LIMIT "_by_limit" +#define SCANSTAT_WORD_SYSTEM "_by_system" +#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" + + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -270,7 +314,8 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - + /* For recording LRU-scan statistics */ + struct scanstat scanstat; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif +static void __mem_cgroup_record_scanstat(unsigned long *stats, + struct memcg_scanrecord *rec) +{ + + stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; + stats[SCAN_ANON] += rec->nr_scanned[0]; + stats[SCAN_FILE] += rec->nr_scanned[1]; + + stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; + stats[ROTATE_ANON] += rec->nr_rotated[0]; + stats[ROTATE_FILE] += rec->nr_rotated[1]; + + stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; + stats[FREED_ANON] += rec->nr_freed[0]; + stats[FREED_FILE] += rec->nr_freed[1]; + + stats[ELAPSED] += rec->elapsed; +} + +static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) +{ + struct mem_cgroup *mem; + int context = rec->context; + + if (context >= NR_SCAN_CONTEXT) + return; + + mem = rec->mem; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); + spin_unlock(&mem->scanstat.lock); + + mem = rec->root; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); + spin_unlock(&mem->scanstat.lock); +} + /* * Scan the hierarchy if needed to reclaim memory. 
We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + struct memcg_scanrecord rec; unsigned long excess; - unsigned long nr_scanned; + unsigned long scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; + if (shrink) + rec.context = SCAN_BY_SHRINK; + else if (check_soft) + rec.context = SCAN_BY_SYSTEM; + else + rec.context = SCAN_BY_LIMIT; + + rec.root = root_mem; + while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } + rec.mem = victim; + rec.nr_scanned[0] = 0; + rec.nr_scanned[1] = 0; + rec.nr_rotated[0] = 0; + rec.nr_rotated[1] = 0; + rec.nr_freed[0] = 0; + rec.nr_freed[1] = 0; + rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; + noswap, zone, &rec, &scanned); + *total_scanned += scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); + noswap, &rec); + mem_cgroup_record_scanstat(&rec); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3792,14 +3894,18 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { + struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } + rec.context = SCAN_BY_SHRINK; 
+ rec.mem = mem; + rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false); + false, &rec); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, + struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + char string[64]; + int i; + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); + } + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); + } + return 0; +} + +static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, + unsigned int event) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + spin_lock(&mem->scanstat.lock); + memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); + memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); + spin_unlock(&mem->scanstat.lock); + return 0; +} + + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif + { + .name = "vmscan_stat", + .read_map = mem_cgroup_vmscan_stat_read, + .trigger = 
mem_cgroup_reset_vmscan_stat, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); + spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); -- cgit v1.2.3 From d1a05b6973c7cb33144fa965d73facc708ffc37d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:27 -0700 Subject: memcg: do not try to drain per-cpu caches without pages drain_all_stock_async tries to optimize a work to be done on the work queue by excluding any work for the current CPU because it assumes that the context we are called from already tried to charge from that cache and it's failed so it must be empty already. While the assumption is correct we can optimize it even more by checking the current number of pages in the cache. This will also reduce a work on other CPUs with an empty stock. For the current CPU we can simply call drain_local_stock rather than deferring it to the work queue. 
[kamezawa.hiroyu@jp.fujitsu.com: use drain_local_stock for current CPU optimization] Signed-off-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04e505bfd7dd..2f5534e1968c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2180,11 +2180,8 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *mem; - if (cpu == curcpu) - continue; - mem = stock->cached; - if (!mem) + if (!mem || !stock->nr_pages) continue; if (mem != root_mem) { if (!root_mem->use_hierarchy) @@ -2193,8 +2190,12 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) if (!css_is_ancestor(&mem->css, &root_mem->css)) continue; } - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) - schedule_work_on(cpu, &stock->work); + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (cpu == curcpu) + drain_local_stock(&stock->work); + else + schedule_work_on(cpu, &stock->work); + } } put_online_cpus(); mutex_unlock(&percpu_charge_mutex); -- cgit v1.2.3 From d38144b7a5f8d0a5e05d549177191374c6911009 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:28 -0700 Subject: memcg: unify sync and async per-cpu charge cache draining Currently we have two ways how to drain per-CPU caches for charges. drain_all_stock_sync will synchronously drain all caches while drain_all_stock_async will asynchronously drain only those that refer to a given memory cgroup or its subtree in hierarchy. Targeted async draining has been introduced by 26fe6168 (memcg: fix percpu cached charge draining frequency) to reduce the cpu workers number. 
sync draining is currently triggered only from mem_cgroup_force_empty which is triggered only by userspace (mem_cgroup_force_empty_write) or when a cgroup is removed (mem_cgroup_pre_destroy). Although these are not usually frequent operations it still makes some sense to do targeted draining as well, especially if the box has many CPUs. This patch unifies both methods to use the single code (drain_all_stock) which relies on the original async implementation and just adds flush_work to wait on all caches that are still under work for the sync mode. We are using FLUSHING_CACHED_CHARGE bit check to prevent from waiting on a work that we haven't triggered. Please note that both sync and async functions are currently protected by percpu_charge_mutex so we cannot race with other drainers. Signed-off-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f5534e1968c..af920d0f9025 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2154,19 +2154,14 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) } /* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. + * Drains all per-CPU charge caches for given root_mem resp. subtree + * of the hierarchy under it. sync flag says whether we should block + * until the work is done. */ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) { int cpu, curcpu; - /* - * If someone calls draining, avoid adding more kworker runs. 
- */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; + /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); /* @@ -2197,17 +2192,42 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) schedule_work_on(cpu, &stock->work); } } + + if (!sync) + goto out; + + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + flush_work(&stock->work); + } +out: put_online_cpus(); +} + +/* + * Tries to drain stocked charges in other cpus. This function is asynchronous + * and just put a work per cpu for draining localy on each cpu. Caller can + * expects some charges will be back to res_counter later but cannot wait for + * it. + */ +static void drain_all_stock_async(struct mem_cgroup *root_mem) +{ + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; + drain_all_stock(root_mem, false); mutex_unlock(&percpu_charge_mutex); - /* We don't wait for flush_work */ } /* This is a synchronous drain interface. */ -static void drain_all_stock_sync(void) +static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ mutex_lock(&percpu_charge_mutex); - schedule_on_each_cpu(drain_local_stock); + drain_all_stock(root_mem, true); mutex_unlock(&percpu_charge_mutex); } @@ -3856,7 +3876,7 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); - drain_all_stock_sync(); + drain_all_stock_sync(mem); ret = 0; mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { -- cgit v1.2.3 From 3e92041d68b40c47faa34c7dc08fc650a6c36adc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:29 -0700 Subject: memcg: add mem_cgroup_same_or_subtree() helper We are checking whether a given two groups are same or at least in the same subtree of a hierarchy at several places. 
Let's make a helper for it to make code easier to read. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af920d0f9025..79f23a189941 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1108,6 +1108,21 @@ void mem_cgroup_move_lists(struct page *page, mem_cgroup_add_lru_list(page, to); } +/* + * Checks whether given mem is same or in the root_mem's + * hierarchy subtree + */ +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, + struct mem_cgroup *mem) +{ + if (root_mem != mem) { + return (root_mem->use_hierarchy && + css_is_ancestor(&mem->css, &root_mem->css)); + } + + return true; +} + int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; @@ -1127,10 +1142,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). 
*/ - if (mem->use_hierarchy) - ret = css_is_ancestor(&curr->css, &mem->css); - else - ret = (curr == mem); + ret = mem_cgroup_same_or_subtree(mem, curr); css_put(&curr->css); return ret; } @@ -1369,10 +1381,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) to = mc.to; if (!from) goto unlock; - if (from == mem || to == mem - || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) - || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) - ret = true; + + ret = mem_cgroup_same_or_subtree(mem, from) + || mem_cgroup_same_or_subtree(mem, to); unlock: spin_unlock(&mc.lock); return ret; @@ -1915,25 +1926,20 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, + *oom_wait_mem; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); + oom_wait_mem = oom_wait_info->mem; - if (oom_wait_info->mem == wake_mem) - goto wakeup; - /* if no hierarchy, no match */ - if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) - return 0; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. 
*/ - if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && - !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) + && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) return 0; - -wakeup: return autoremove_wake_function(wait, mode, sync, arg); } @@ -2178,13 +2184,8 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) mem = stock->cached; if (!mem || !stock->nr_pages) continue; - if (mem != root_mem) { - if (!root_mem->use_hierarchy) - continue; - /* check whether "mem" is under tree of "root_mem" */ - if (!css_is_ancestor(&mem->css, &root_mem->css)) - continue; - } + if (!mem_cgroup_same_or_subtree(root_mem, mem)) + continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); -- cgit v1.2.3 From 8521fc50d433507a7cdc96bec280f9e5888a54cc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:29 -0700 Subject: memcg: get rid of percpu_charge_mutex lock percpu_charge_mutex protects from multiple simultaneous per-cpu charge caches draining because we might end up having too many work items. At least this was the case until commit 26fe61684449 ("memcg: fix percpu cached charge draining frequency") when we introduced a more targeted draining for async mode. Now that also sync draining is targeted we can safely remove mutex because we will not send more work than the current number of CPUs. FLUSHING_CACHED_CHARGE protects from sending the same work multiple times and stock->nr_pages == 0 protects from pointless sending a work if there is obviously nothing to be done. This is of course racy but we can live with it as the race window is really small (we would have to see FLUSHING_CACHED_CHARGE cleared while nr_pages would be still non-zero). 
The only remaining place where we can race is synchronous mode when we rely on FLUSHING_CACHED_CHARGE test which might have been set by other drainer on the same group but we should wait in that case as well. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79f23a189941..5f84d2351ddb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2092,7 +2092,6 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); -static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -2199,7 +2198,8 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && + test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) flush_work(&stock->work); } out: @@ -2214,22 +2214,14 @@ out: */ static void drain_all_stock_async(struct mem_cgroup *root_mem) { - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; drain_all_stock(root_mem, false); - mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. 
*/ static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ - mutex_lock(&percpu_charge_mutex); drain_all_stock(root_mem, true); - mutex_unlock(&percpu_charge_mutex); } /* -- cgit v1.2.3 From aa3b189551ad8e5cc1d9c663735c131650238278 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:24 -0700 Subject: tmpfs: convert mem_cgroup shmem to radix-swap Remove mem_cgroup_shmem_charge_fallback(): it was only required when we had to move swappage to filecache with GFP_NOWAIT. Remove the GFP_NOWAIT special case from mem_cgroup_cache_charge(), by moving its call out from shmem_add_to_page_cache() to two of its three callers. But leave it doing mem_cgroup_uncharge_cache_page() on error: although asymmetrical, it's easier for all 3 callers to handle. These two changes would also be appropriate if anyone were to start using shmem_read_mapping_page_gfp() with GFP_NOWAIT. Remove mem_cgroup_get_shmem_target(): mc_handle_file_pte() can test radix_tree_exceptional_entry() to get what it needs for itself. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 66 ++++++++------------------------------------------------- 1 file changed, 9 insertions(+), 57 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f84d2351ddb..f4ec4e7ca4cd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2873,30 +2872,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (PageCompound(page)) return 0; - /* - * Corner case handling. This is called from add_to_page_cache() - * in usual. But some FS (shmem) precharges this page before calling it - * and call add_to_page_cache() with GFP_NOWAIT. - * - * For GFP_NOWAIT case, the page may be pre-charged before calling - * add_to_page_cache().
(See shmem.c) check it here and avoid to call - * charge twice. (It works but has to pay a bit larger cost.) - * And when the page is SwapCache, it should take swap information - * into account. This is under lock_page() now. - */ - if (!(gfp_mask & __GFP_WAIT)) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - if (!pc) - return 0; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - unlock_page_cgroup(pc); - return 0; - } - unlock_page_cgroup(pc); - } if (unlikely(!mm)) mm = &init_mm; @@ -3486,31 +3461,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, cgroup_release_and_wakeup_rmdir(&mem->css); } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. - * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *mem; - int ret; - - if (mem_cgroup_disabled()) - return 0; - - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); - if (!ret) - mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - - return ret; -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -5330,15 +5280,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - if (!mapping_cap_swap_backed(mapping)) { /* normal file */ - page = find_get_page(mapping, pgoff); - } else { /* shmem/tmpfs file. we should take account of swap too. 
*/ - swp_entry_t ent; - mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) - entry->val = ent.val; + *entry = swap; + page = find_get_page(&swapper_space, swap.val); } - +#endif return page; } -- cgit v1.2.3 From 9f50fad65b87a8776ae989ca059ad6c17925dfc3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 9 Aug 2011 11:56:26 +0200 Subject: Revert "memcg: get rid of percpu_charge_mutex lock" This reverts commit 8521fc50d433507a7cdc96bec280f9e5888a54cc. The patch incorrectly assumes that using atomic FLUSHING_CACHED_CHARGE bit operations is sufficient but that is not true. Johannes Weiner has reported a crash during parallel memory cgroup removal: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] css_is_ancestor+0x20/0x70 Oops: 0000 [#1] PREEMPT SMP Pid: 19677, comm: rmdir Tainted: G W 3.0.0-mm1-00188-gf38d32b #35 ECS MCP61M-M3/MCP61M-M3 RIP: 0010:[] css_is_ancestor+0x20/0x70 RSP: 0018:ffff880077b09c88 EFLAGS: 00010202 Process rmdir (pid: 19677, threadinfo ffff880077b08000, task ffff8800781bb310) Call Trace: [] mem_cgroup_same_or_subtree+0x33/0x40 [] drain_all_stock+0x11f/0x170 [] mem_cgroup_force_empty+0x231/0x6d0 [] mem_cgroup_pre_destroy+0x14/0x20 [] cgroup_rmdir+0xb9/0x500 [] vfs_rmdir+0x86/0xe0 [] do_rmdir+0xfb/0x110 [] sys_rmdir+0x16/0x20 [] system_call_fastpath+0x16/0x1b We are crashing because we try to dereference cached memcg when we are checking whether we should wait for draining on the cache. The cache is already cleaned up, though. 
There is also a theoretical chance that the cached memcg gets freed between we test for the FLUSHING_CACHED_CHARGE and dereference it in mem_cgroup_same_or_subtree: CPU0 CPU1 CPU2 mem=stock->cached stock->cached=NULL clear_bit test_and_set_bit test_bit() ... mem_cgroup_destroy use after free The percpu_charge_mutex protected from this race because sync draining is exclusive. It is safer to revert now and come up with a more parallel implementation later. Signed-off-by: Michal Hocko Reported-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ec4e7ca4cd..930de9437271 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2091,6 +2091,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -2197,8 +2198,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && - test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) flush_work(&stock->work); } out: @@ -2213,14 +2213,22 @@ out: */ static void drain_all_stock_async(struct mem_cgroup *root_mem) { + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; drain_all_stock(root_mem, false); + mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. 
*/ static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ + mutex_lock(&percpu_charge_mutex); drain_all_stock(root_mem, true); + mutex_unlock(&percpu_charge_mutex); } /* -- cgit v1.2.3 From 5af12d0efdbd9967cc71a0a10c4025c4255a6254 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Aug 2011 15:59:07 -0700 Subject: memcg: pin execution to current cpu while draining stock Commit d1a05b6973c7 ("memcg do not try to drain per-cpu caches without pages") added a drain_local_stock() call to a preemptible section. The draining task looks up the cpu-local stock twice to set the draining-flag, then to drain the stock and clear the flag again. If the task is migrated to a different CPU in between, no one will clear the flag on the first stock and it will be forever undrainable. Its charge cannot be recovered and the cgroup cannot be deleted anymore. Properly pin the task to the executing CPU while draining stocks. Signed-off-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 930de9437271..0e40f0205732 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2169,13 +2169,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); - /* - * Get a hint for avoiding draining charges on the current cpu, - * which must be exhausted by our charging. It is not required that - * this be a precise check, so we use raw_smp_processor_id() instead of - * getcpu()/putcpu().
- */ - curcpu = raw_smp_processor_id(); + curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *mem; @@ -2192,6 +2186,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) schedule_work_on(cpu, &stock->work); } } + put_cpu(); if (!sync) goto out; -- cgit v1.2.3 From 23751be0094012eb6b4756fa80ca54b3eb83069f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Aug 2011 15:59:16 -0700 Subject: memcg: fix hierarchical oom locking Commit 79dfdaccd1d5 ("memcg: make oom_lock 0 and 1 based rather than counter") tried to oom lock the hierarchy and roll back upon encountering an already locked memcg. The code is confused when it comes to detecting a locked memcg, though, so it would fail and rollback after locking one memcg and encountering an unlocked second one. The result is that oom-locking hierarchies fails unconditionally and that every oom killer invocation simply goes to sleep on the oom waitqueue forever. The tasks practically hang forever without anyone intervening, possibly holding locks that trip up unrelated tasks, too. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e40f0205732..ebd1e86bef1c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1841,29 +1841,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int lock_count = -1; struct mem_cgroup *iter, *failed = NULL; bool cond = true; for_each_mem_cgroup_tree_cond(iter, mem, cond) { - bool locked = iter->oom_lock; - - iter->oom_lock = true; - if (lock_count == -1) - lock_count = iter->oom_lock; - else if (lock_count != locked) { + if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ - lock_count = 0; failed = iter; cond = false; - } + } else + iter->oom_lock = true; } if (!failed) - goto done; + return true; /* * OK, we failed to lock the whole subtree so we have to clean up @@ -1877,8 +1871,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) } iter->oom_lock = false; } -done: - return lock_count; + return false; } /* -- cgit v1.2.3 From 185efc0f9a1f2d6ad6d4782c5d9e529f3290567f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 14 Sep 2011 16:21:58 -0700 Subject: memcg: Revert "memcg: add memory.vmscan_stat" Revert the post-3.0 commit 82f9d486e59f5 ("memcg: add memory.vmscan_stat"). The implementation of per-memcg reclaim statistics violates how memcg hierarchies usually behave: hierarchically. The reclaim statistics are accounted to child memcgs and the parent hitting the limit, but not to hierarchy levels in between. Usually, hierarchical statistics are perfectly recursive, with each level representing the sum of itself and all its children. 
Since this exports statistics to userspace, this may lead to confusion and problems with changing things after the release, so revert it now, we can try again later. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 172 ++------------------------------------------------------ 1 file changed, 6 insertions(+), 166 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ebd1e86bef1c..3508777837c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); -enum { - SCAN_BY_LIMIT, - SCAN_BY_SYSTEM, - NR_SCAN_CONTEXT, - SCAN_BY_SHRINK, /* not recorded now */ -}; - -enum { - SCAN, - SCAN_ANON, - SCAN_FILE, - ROTATE, - ROTATE_ANON, - ROTATE_FILE, - FREED, - FREED_ANON, - FREED_FILE, - ELAPSED, - NR_SCANSTATS, -}; - -struct scanstat { - spinlock_t lock; - unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; - unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; -}; - -const char *scanstat_string[NR_SCANSTATS] = { - "scanned_pages", - "scanned_anon_pages", - "scanned_file_pages", - "rotated_pages", - "rotated_anon_pages", - "rotated_file_pages", - "freed_pages", - "freed_anon_pages", - "freed_file_pages", - "elapsed_ns", -}; -#define SCANSTAT_WORD_LIMIT "_by_limit" -#define SCANSTAT_WORD_SYSTEM "_by_system" -#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" - - /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -313,8 +269,7 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* For recording LRU-scan statistics */ - struct scanstat scanstat; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif -static void __mem_cgroup_record_scanstat(unsigned long *stats, - struct memcg_scanrecord *rec) -{ - - stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; - stats[SCAN_ANON] += rec->nr_scanned[0]; - stats[SCAN_FILE] += rec->nr_scanned[1]; - - stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; - stats[ROTATE_ANON] += rec->nr_rotated[0]; - stats[ROTATE_FILE] += rec->nr_rotated[1]; - - stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; - stats[FREED_ANON] += rec->nr_freed[0]; - stats[FREED_FILE] += rec->nr_freed[1]; - - stats[ELAPSED] += rec->elapsed; -} - -static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) -{ - struct mem_cgroup *mem; - int context = rec->context; - - if (context >= NR_SCAN_CONTEXT) - return; - - mem = rec->mem; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); - spin_unlock(&mem->scanstat.lock); - - mem = rec->root; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); - spin_unlock(&mem->scanstat.lock); -} - /* * Scan the hierarchy if needed to reclaim memory. 
We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - struct memcg_scanrecord rec; unsigned long excess; - unsigned long scanned; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; - if (shrink) - rec.context = SCAN_BY_SHRINK; - else if (check_soft) - rec.context = SCAN_BY_SYSTEM; - else - rec.context = SCAN_BY_LIMIT; - - rec.root = root_mem; - while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } - rec.mem = victim; - rec.nr_scanned[0] = 0; - rec.nr_scanned[1] = 0; - rec.nr_rotated[0] = 0; - rec.nr_rotated[1] = 0; - rec.nr_freed[0] = 0; - rec.nr_freed[1] = 0; - rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &rec, &scanned); - *total_scanned += scanned; + noswap, zone, &nr_scanned); + *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, &rec); - mem_cgroup_record_scanstat(&rec); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3854,18 +3752,14 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { - struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } - rec.context = SCAN_BY_SHRINK; 
- rec.mem = mem; - rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, &rec); + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4709,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ -static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, - struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - char string[64]; - int i; - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); - } - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); - } - return 0; -} - -static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, - unsigned int event) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - - spin_lock(&mem->scanstat.lock); - memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); - memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); - spin_unlock(&mem->scanstat.lock); - return 0; -} - - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4827,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif - { - .name = "vmscan_stat", - .read_map = mem_cgroup_vmscan_stat_read, - .trigger = 
mem_cgroup_reset_vmscan_stat, - }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -5095,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); - spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); -- cgit v1.2.3