From fceda1bf498677501befc7da72fd2e4de7f18466 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:30 -0800 Subject: memsw: handle swapaccount kernel parameter correctly __setup based kernel command line parameters handlers which are handled in obsolete_checksetup are provided with the parameter value including = (more precisely everything right after the parameter name). This means that the current implementation of swapaccount[=1|0] doesn't work at all because if there is a value for the parameter then we are testing for "0" resp. "1" but we are getting "=0" resp. "=1" and if there is no parameter value we are getting an empty string rather than NULL. The original noswapccount parameter, which doesn't care about the value, works correctly. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3878cfe399dc..44f9f9c89f0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,9 +5024,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -5034,7 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3 From 552b372ba9db85751e7db2998f07cca2e51f5865 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:31 -0800 Subject: memsw: deprecate noswapaccount kernel parameter and schedule it for removal noswapaccount couldn't be used to control memsw for both on/off cases so we have added swapaccount[=0|1] parameter. This way we can turn the feature in two ways noswapaccount resp. swapaccount=0. We have kept the original noswapaccount but I think we should remove it after some time as it just makes more command line parameters without any advantages and also the code to handle parameters is uglier if we want both parameters. Signed-off-by: Michal Hocko Requested-by: KAMEZAWA Hiroyuki Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44f9f9c89f0c..79abb1fd39d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5034,6 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); enable_swap_account("=0"); return 1; } -- cgit v1.2.3 From 9221edb7120e2dc3ae90f1c58514979f7ba40e46 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:42 -0800 Subject: memcg: prevent endless loop when charging huge pages The charging code can encounter a charge size that is bigger than a regular page in two situations: one is a batched charge to fill the per-cpu stocks, the other is a huge page charge. This code is distributed over two functions, however, and only the outer one is aware of huge pages. In case the charging fails, the inner function will tell the outer function to retry if the charge size is bigger than regular pages--assuming batched charging is the only case. And the outer function will retry forever charging a huge page. This patch makes sure the inner function can distinguish between batch charging and a single huge page charge. It will only signal another attempt if batch charging failed, and go into regular reclaim when it is called on behalf of a huge page. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79abb1fd39d2..50eb50e100fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1837,8 +1837,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) -- cgit v1.2.3 From 19942822df65ee4a47c2e6d6d70cace1b7f01710 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:43 -0800 Subject: memcg: prevent endless loop when charging huge pages to near-limit group If reclaim after a failed charging was unsuccessful, the limits are checked again, just in case they settled by means of other tasks. This is all fine as long as every charge is of size PAGE_SIZE, because in that case, being below the limit means having at least PAGE_SIZE bytes available. But with transparent huge pages, we may end up in an endless loop where charging and reclaim fail, but we keep going because the limits are not yet exceeded, although not allowing for a huge page. Fix this up by explicitely checking for enough room, not just whether we are within limits. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 50eb50e100fd..0e81eb5f0aea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1111,6 +1111,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1852,15 +1869,19 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* -- cgit v1.2.3 From 8493ae439f7038b502df1d687e61dde54c27ca92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:44 -0800 Subject: memcg: never OOM when charging huge pages Huge page coverage should obviously have less priority than the continued execution of a process. Never kill a process when charging it a huge page fails. Instead, give up after the first failed reclaim attempt and fall back to regular pages. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e81eb5f0aea..fc75f34ba609 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2351,13 +2351,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2366,7 +2372,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; -- cgit v1.2.3 From 3751d60430fe4c26460a5ca8ad8672d32f93bcb1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 1 Feb 2011 15:52:45 -0800 Subject: memcg: fix event counting breakage from recent THP update Changes in e401f1761 ("memcg: modify accounting function for supporting THP better") adds nr_pages to support multiple page size in memory_cgroup_charge_statistics. But counting the number of event nees abs(nr_pages) for increasing counters. This patch fixes event counting. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc75f34ba609..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + nr_pages = -nr_pages; /* for event */ + } __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); -- cgit v1.2.3