From 7a64bf05b2a6fe3703062d13d389e3eb904741c6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:51 -0800 Subject: mm: add a __GFP_KMEMCG flag This flag is used to indicate to the callees that this allocation is a kernel allocation in process context, and should be accounted to current's memcg. Signed-off-by: Glauber Costa Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ include/trace/events/gfpflags.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f74856e17e48..643c9a6f7f34 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -89,6 +90,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..1eddbf1557f2 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ -- cgit v1.2.3 From 7ae1e1d0f8ac2927ed7e3ca6d15e42d485903459 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:56 -0800 Subject: memcg: kmem controller infrastructure Introduce infrastructure for tracking kernel memory pages to a given memcg. This will happen whenever the caller passes the __GFP_KMEMCG flag and the task belongs to a memcg other than the root. In memcontrol.h those functions are wrapped in inline accessors. The idea is to later patch these with static branches, so that we incur no overhead when no mem cgroups with limited kmem are in use. Users of this functionality shall interact with the memcg core code through the following functions: memcg_kmem_newpage_charge: will return true if the group can handle the allocation. At this point, struct page is not yet allocated. memcg_kmem_commit_charge: will either revert the charge, if struct page allocation failed, or embed memcg information into page_cgroup. memcg_kmem_uncharge_pages: called at free time, will revert the charge.
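As a rough illustration of the calling protocol described above - the wrapper function below is a hypothetical sketch for this document, not code from the patch:

	static struct page *alloc_pages_accounted(gfp_t gfp_mask, unsigned int order)
	{
		struct mem_cgroup *memcg = NULL;
		struct page *page;

		/* May veto the allocation if current's memcg is over its kmem limit. */
		if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
			return NULL;

		page = alloc_pages(gfp_mask, order);

		/* Commits the charge to page_cgroup, or reverts it if @page is NULL. */
		memcg_kmem_commit_charge(page, memcg, order);

		return page;
	}

The matching memcg_kmem_uncharge_pages(page, order) call then undoes the charge on the free path.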
Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 110 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e98a74c0c9c0..afa2ad40457e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #define _LINUX_MEMCONTROL_H #include #include +#include struct mem_cgroup; struct page_cgroup; @@ -414,5 +415,114 @@ static inline void sock_release_memcg(struct sock *sk) { } #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_kmem_enabled(void) +{ + return true; +} + +/* + * In general, we'll do everything in our power to not incur in any overhead + * for non-memcg users for the kmem functions. Not even a function call, if we + * can avoid it. + * + * Therefore, we'll inline all those functions so that in the best case, we'll + * see that kmemcg is off for everybody and proceed quickly. If it is on, + * we'll still do most of the flag checking inline. We check a lot of + * conditions, but because they are pretty simple, they are expected to be + * fast. + */ +bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, + int order); +void __memcg_kmem_commit_charge(struct page *page, + struct mem_cgroup *memcg, int order); +void __memcg_kmem_uncharge_pages(struct page *page, int order); + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (!memcg_kmem_enabled()) + return true; + + /* + * __GFP_NOFAIL allocations will move on even if charging is not + * possible. Therefore we don't even try, and have this allocation + * unaccounted. We could in theory charge it with + * res_counter_charge_nofail, but we hope those allocations are rare, + * and won't be worth the trouble. + */ + if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + return true; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return true; + + /* If the test is dying, just let it go. */ + if (unlikely(fatal_signal_pending(current))) + return true; + + return __memcg_kmem_newpage_charge(gfp, memcg, order); +} + +/** + * memcg_kmem_uncharge_pages: uncharge pages from memcg + * @page: pointer to struct page being freed + * @order: allocation order. + * + * there is no need to specify memcg here, since it is embedded in page_cgroup + */ +static inline void +memcg_kmem_uncharge_pages(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_pages(page, order); +} + +/** + * memcg_kmem_commit_charge: embeds correct memcg in a page + * @page: pointer to struct page recently allocated + * @memcg: the memcg structure we charged against + * @order: allocation order. 
+ * + * Needs to be called after memcg_kmem_newpage_charge, regardless of success or + * failure of the allocation. if @page is NULL, this function will revert the + * charges. Otherwise, it will commit the memcg given by @memcg to the + * corresponding page_cgroup. + */ +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ + if (memcg_kmem_enabled() && memcg) + __memcg_kmem_commit_charge(page, memcg, order); +} + +#else +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + return true; +} + +static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +{ +} + +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 6a1a0d3b625a4091e7a0eb249aefc6a644385149 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:00 -0800 Subject: mm: allocate kernel pages to the right memcg When a process tries to allocate a page with the __GFP_KMEMCG flag, the page allocator will call the corresponding memcg functions to validate the allocation. Tasks in the root memcg can always proceed. To avoid adding markers to the page - and a kmem flag that would necessarily follow - as well as doing page_cgroup lookups for no reason, whoever marks its allocations with the __GFP_KMEMCG flag is responsible for telling the page allocator that this is such an allocation at free_pages() time. This is done by invoking __free_memcg_kmem_pages() and free_memcg_kmem_pages(). Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 643c9a6f7f34..0f615eb23d05 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -367,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); +extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); + #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) -- cgit v1.2.3 From 50bdd430c20566b13d8bc59946184b08f5875de6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:04 -0800 Subject: res_counter: return amount of charges after res_counter_uncharge() It is useful to know how many charges are still left after a call to res_counter_uncharge(). While it is possible to issue a res_counter_read after uncharge, this can be racy. If we need, for instance, to take some action when the counters drop down to 0, only one of the callers should see it. These are the same semantics as the kernel's atomic variables. Since the current return value is void, we don't need to worry about anything breaking due to this change: nobody relied on that, and only users appearing from now on will be checking this value.
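For illustration, a sketch of the pattern this return value enables - the function and the cleanup hook are hypothetical, not part of the patch:

	static void uncharge_and_maybe_cleanup(struct res_counter *counter,
					       unsigned long val)
	{
		u64 remaining = res_counter_uncharge(counter, val);

		/*
		 * The remaining usage is computed under the counter lock, so
		 * exactly one caller can observe the drop to zero and take
		 * the one-shot action.
		 */
		if (remaining == 0)
			counter_went_idle(counter);	/* hypothetical hook */
	}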
Signed-off-by: Glauber Costa Reviewed-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 6f54e40fa218..5ae8456d9670 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter, * * these calls check for usage underflow and show a warning on the console * _locked call expects the counter->lock to be taken + * + * returns the total charges still present in @counter. */ -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); -void res_counter_uncharge(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val); +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val); /** * res_counter_margin - calculate chargeable space of a counter * @cnt: the counter -- cgit v1.2.3 From a8964b9b84f99c0b1b5d7c09520f89f0700e742e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:09 -0800 Subject: memcg: use static branches when code not in use We can use static branches to patch the code in or out when not used. Because the _ACTIVE bit on kmem_accounted is only set after the increment is done, we guarantee that the root memcg will always be selected for kmem charges until all call sites are patched (see memcg_kmem_enabled). This guarantees that no mischarges are applied. Static branch decrement happens when the last reference count from the kmem accounting in memcg dies. This will only happen when the charges drop down to 0. When that happens, we need to disable the static branch only on those memcgs that enabled it. To achieve this, we would be forced to complicate the code by keeping track of which memcgs were the ones that actually enabled limits, and which ones got it from its parents. It is a lot simpler just to do static_key_slow_inc() on every child that is accounted. 
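A sketch of the enabling side described above; the _ACTIVE bit and the flag field name are assumptions for illustration, not necessarily the identifiers used in mm/memcontrol.c:

	static void memcg_kmem_set_active(struct mem_cgroup *memcg)
	{
		/* Patch the memcg_kmem_enabled() call sites in first... */
		static_key_slow_inc(&memcg_kmem_enabled_key);
		/*
		 * ...and only set the _ACTIVE bit afterwards, so call sites
		 * that are not patched yet keep charging the root memcg and
		 * no mischarge can happen.
		 */
		set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
	}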
Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index afa2ad40457e..87d61e840ddd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -22,6 +22,7 @@ #include #include #include +#include struct mem_cgroup; struct page_cgroup; @@ -417,9 +418,10 @@ static inline void sock_release_memcg(struct sock *sk) #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ #ifdef CONFIG_MEMCG_KMEM +extern struct static_key memcg_kmem_enabled_key; static inline bool memcg_kmem_enabled(void) { - return true; + return static_key_false(&memcg_kmem_enabled_key); } /* -- cgit v1.2.3 From 2ad306b17c0ac5a1b1f250d5f772aeb87fdf1eba Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:18 -0800 Subject: fork: protect architectures where THREAD_SIZE >= PAGE_SIZE against fork bombs Because those architectures will draw their stacks directly from the page allocator, rather than the slab cache, we can directly pass the __GFP_KMEMCG flag and issue the corresponding free_pages() call. This code path is taken when the architecture doesn't define CONFIG_ARCH_THREAD_INFO_ALLOCATOR (only ia64 seems to), and has THREAD_SIZE >= PAGE_SIZE. Luckily, most - if not all - of the remaining architectures fall in this category. This will guarantee that every stack page is accounted to the memcg the process currently lives on, and that the allocations will fail if they go over the limit. For the time being, I am defining a new variant of THREADINFO_GFP so as not to mess with the other path. Once the slab is also tracked by memcg, we can get rid of that flag. Tested to successfully protect against :(){ :|:& };: Signed-off-by: Glauber Costa Acked-by: Frederic Weisbecker Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ccc1899bd62e..e7e04736802f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif +#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) + /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions -- cgit v1.2.3 From ba6c496ed834a37a26fc6fc87fc9aecb0fa0014d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:27 -0800 Subject: slab/slub: struct memcg_params For the kmem slab controller, we need to record some extra information in the kmem_cache structure.
Signed-off-by: Glauber Costa Signed-off-by: Suleiman Souhlal Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 24 ++++++++++++++++++++++++ include/linux/slab_def.h | 3 +++ include/linux/slub_def.h | 3 +++ 3 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 743a10415122..00efba149222 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -176,6 +176,30 @@ void kmem_cache_free(struct kmem_cache *, void *); #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * This is the main placeholder for memcg-related information in kmem caches. + * struct kmem_cache will hold a pointer to it, so the memory cost while + * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it + * would otherwise be if that would be bundled in kmem_cache: we'll need an + * extra pointer chase. But the trade off clearly lays in favor of not + * penalizing non-users. + * + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. + * + * Child caches will hold extra metadata needed for its operation. Fields are: + * + * @memcg: pointer to the memcg this cache belongs to + */ +struct memcg_cache_params { + bool is_root_cache; + union { + struct kmem_cache *memcg_caches[0]; + struct mem_cgroup *memcg; + }; +}; + /* * Common kmalloc functions provided by all allocators */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 45c0356fdc8c..8bb6e0eaf3c6 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -81,6 +81,9 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif /* 6) per-cpu/per-node data, touched during every alloc/free */ /* diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index df448adb7283..961e72eab907 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,6 +101,9 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif #ifdef CONFIG_NUMA /* -- cgit v1.2.3 From 2633d7a028239a738b793be5ca8fa6ac312f5793 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:34 -0800 Subject: slab/slub: consider a memcg parameter in kmem_create_cache Allow a memcg parameter to be passed during cache creation. When the slub allocator is being used, it will only merge caches that belong to the same memcg; we do this by scanning the global list and then translating the cache to a memcg-specific cache. A default function is created as a wrapper, passing NULL to the memcg version. A helper, memcg_css_id, is provided because slub needs a unique cache name for sysfs; since that name is visible but not the canonical location for slab data, the cache name itself is not used and the css_id suffices.
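The wrapper mentioned above boils down to the following sketch, using the prototypes this patch adds to slab.h (the body is illustrative; the real implementation lives in the slab common code):

	struct kmem_cache *
	kmem_cache_create(const char *name, size_t size, size_t align,
			  unsigned long flags, void (*ctor)(void *))
	{
		/* The default entry point is simply the NULL-memcg case. */
		return kmem_cache_create_memcg(NULL, name, size, align,
					       flags, ctor);
	}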
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 ++++++++++++++++++++++++++ include/linux/slab.h | 14 +++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 87d61e840ddd..0b69a0470007 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -28,6 +28,7 @@ struct mem_cgroup; struct page_cgroup; struct page; struct mm_struct; +struct kmem_cache; /* Stats that can be updated by kernel. */ enum mem_cgroup_page_stat_item { @@ -441,6 +442,11 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order); void __memcg_kmem_uncharge_pages(struct page *page, int order); +int memcg_cache_id(struct mem_cgroup *memcg); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); +void memcg_release_cache(struct kmem_cache *cachep); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. @@ -525,6 +531,26 @@ static inline void memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) { } + +static inline int memcg_cache_id(struct mem_cgroup *memcg) +{ + return -1; +} + +static inline int memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + return 0; +} + +static inline void memcg_release_cache(struct kmem_cache *cachep) +{ +} + +static inline void memcg_cache_list_add(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 00efba149222..c0fcf28c15b2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,7 @@ struct kmem_cache { }; #endif +struct mem_cgroup; /* * struct kmem_cache related prototypes */ @@ -125,6 +126,9 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, + unsigned long, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); @@ -191,15 +195,23 @@ void kmem_cache_free(struct kmem_cache *, void *); * Child caches will hold extra metadata needed for its operation. 
Fields are: * * @memcg: pointer to the memcg this cache belongs to + * @list: list_head for the list of all caches in this memcg + * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { bool is_root_cache; union { struct kmem_cache *memcg_caches[0]; - struct mem_cgroup *memcg; + struct { + struct mem_cgroup *memcg; + struct list_head list; + struct kmem_cache *root_cache; + }; }; }; +int memcg_update_all_caches(int num_memcgs); + /* * Common kmalloc functions provided by all allocators */ -- cgit v1.2.3 From 55007d849759252ddd573aeb36143b947202d509 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:38 -0800 Subject: memcg: allocate memory for memcg caches whenever a new memcg appears Every cache that is considered a root cache (basically the "original" caches, tied to the root memcg/no-memcg) will have an array that should be large enough to store a cache pointer per each memcg in the system. Theoretically, this is as high as 1 << sizeof(css_id), which is currently in the 64k pointers range. Most of the time, we won't be using that much. What goes in this patch is a simple scheme to dynamically allocate such an array, in order to minimize memory usage for memcg caches. Because we would also like to avoid allocations all the time, at least for now, the array will only grow. It will tend to be big enough to hold the maximum number of kmem-limited memcgs ever achieved. We'll initially size it for a minimum of 64 kmem-limited memcgs. When we have more than that, we'll start doubling the size of this array every time the limit is reached. Because we are only considering kmem limited memcgs, a natural point for this to happen is when we write to the limit. At that point, we already have set_limit_mutex held, so that will become our natural synchronization mechanism. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b69a0470007..45085e14e023 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -447,6 +447,8 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); void memcg_release_cache(struct kmem_cache *cachep); void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); +int memcg_update_cache_size(struct kmem_cache *s, int num_groups); +void memcg_update_array_size(int num_groups); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. -- cgit v1.2.3 From d7f25f8a2f81252d1ac134470ba1d0a287cf8fcd Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:40 -0800 Subject: memcg: infrastructure to match an allocation to the right cache The page allocator is able to bind a page to a memcg when it is allocated. But for the caches, we'd like to have as many objects as possible in a page belonging to the same cache. This is done in this patch by calling memcg_kmem_get_cache() at the beginning of every allocation function. This function is patched out by static branches when the kernel memory controller is not in use.
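Concretely, every allocator entry point gains a translation step along the lines of this sketch (slab_alloc() here stands in for the allocator-specific fast path):

	void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
	{
		/* Rewrite @cachep to the current memcg's copy, if one applies. */
		cachep = memcg_kmem_get_cache(cachep, flags);
		return slab_alloc(cachep, flags, _RET_IP_);
	}

memcg_kmem_get_cache() comes with caveats.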
It assumes that the task allocating, which determines the memcg in the page allocator, belongs to the same cgroup throughout the whole process. Misaccounting can happen if the task calls memcg_kmem_get_cache() while belonging to a cgroup, and later on changes. This is considered acceptable, and should only happen upon task migration. Before the cache is created by the memcg core, there is also a possible imbalance: the task belongs to a memcg, but the cache being allocated from is the global cache, since the child cache is not yet guaranteed to be ready. This case is also fine, since in this case the GFP_KMEMCG will not be passed and the page allocator will not attempt any cgroup accounting. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 45085e14e023..bd9b5d73bc2b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -449,6 +449,10 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); int memcg_update_cache_size(struct kmem_cache *s, int num_groups); void memcg_update_array_size(int num_groups); + +struct kmem_cache * +__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. @@ -518,6 +522,37 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) __memcg_kmem_commit_charge(page, memcg, order); } +/** + * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * @gfp: allocation flags. + * + * This function assumes that the task allocating, which determines the memcg + * in the page allocator, belongs to the same cgroup throughout the whole + * process. Misacounting can happen if the task calls memcg_kmem_get_cache() + * while belonging to a cgroup, and later on changes. This is considered + * acceptable, and should only happen upon task migration. + * + * Before the cache is created by the memcg core, there is also a possible + * imbalance: the task belongs to a memcg, but the cache being allocated from + * is the global cache, since the child cache is not yet guaranteed to be + * ready. This case is also fine, since in this case the GFP_KMEMCG will not be + * passed and the page allocator will not attempt any cgroup accounting. 
+ */ +static __always_inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (!memcg_kmem_enabled()) + return cachep; + if (gfp & __GFP_NOFAIL) + return cachep; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return cachep; + if (unlikely(fatal_signal_pending(current))) + return cachep; + + return __memcg_kmem_get_cache(cachep, gfp); +} #else static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) @@ -553,6 +588,12 @@ static inline void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *s) { } + +static inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + return cachep; +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 0e9d92f2d02d8c8320f0502307c688d07bdac2b3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:42 -0800 Subject: memcg: skip memcg kmem allocations in specified code regions Create a mechanism that skips memcg allocations during certain pieces of our core code. It basically works in the same way as preempt_disable()/preempt_enable(): by marking a region under which all allocations will be accounted to the root memcg. We need this to prevent races in early cache creation, when we allocate data using caches that are not necessarily created already. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9914c662ed7b..f712465b05c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; -- cgit v1.2.3 From b9ce5ef49f00daf2254c6953c8d31f79aabccd34 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:46 -0800 Subject: sl[au]b: always get the cache from its page in kmem_cache_free() struct page already has this information. If we start chaining caches, this information will always be more trustworthy than whatever is passed into the function.
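A sketch of the free-side counterpart; the __cache_free() call stands in for the allocator-specific free path, and page->slab_cache is the struct page field assumed to carry the owning cache:

	void kmem_cache_free(struct kmem_cache *s, void *x)
	{
		struct page *page = virt_to_head_page(x);

		/*
		 * Trust the page, not the caller: with cache chaining, @s may
		 * be the root cache while the object actually came from a
		 * per-memcg child cache.
		 */
		s = page->slab_cache;
		__cache_free(s, x);
	}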
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bd9b5d73bc2b..2298122e71ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -554,6 +554,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) return __memcg_kmem_get_cache(cachep, gfp); } #else +static inline bool memcg_kmem_enabled(void) +{ + return false; +} + static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) { -- cgit v1.2.3 From d79923fad95b0cdf7770e024677180c734cb7148 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:48 -0800 Subject: sl[au]b: allocate objects from memcg cache We are able to match a cache allocation to a particular memcg. If the task doesn't change groups during the allocation itself - a rare event, this will give us a good picture about who is the first group to touch a cache page. This patch uses the now available infrastructure by calling memcg_kmem_get_cache() before all the cache allocations. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 961e72eab907..364ba6c9fe21 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -225,7 +225,10 @@ void *__kmalloc(size_t size, gfp_t flags); static __always_inline void * kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + void *ret; + + flags |= (__GFP_COMP | __GFP_KMEMCG); + ret = (void *) __get_free_pages(flags, order); kmemleak_alloc(ret, size, 1, flags); return ret; } -- cgit v1.2.3 From 1f458cbf122288b23620ee822e19bcbb76c8d6ec Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:50 -0800 Subject: memcg: destroy memcg caches Implement destruction of memcg caches. Right now, only caches where our reference counter is the last remaining are deleted. If there are any other reference counters around, we just leave the caches lying around until they go away. When that happens, a destruction function is called from the cache code. Caches are only destroyed in process context, so we queue them up for later processing in the general case. 
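A sketch of that deferral, built on the fields this patch adds to struct memcg_cache_params in the hunk below (the helper name and exact trigger are illustrative):

	static void memcg_kmem_page_freed(struct kmem_cache *cachep)
	{
		/*
		 * The last page of a dead memcg's cache can be released from
		 * any context, so the destruction itself is queued and runs
		 * later from a worker in process context.
		 */
		if (atomic_dec_and_test(&cachep->memcg_params->nr_pages) &&
		    cachep->memcg_params->dead)
			schedule_work(&cachep->memcg_params->destroy);
	}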
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 ++ include/linux/slab.h | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2298122e71ad..79fcf0cd7186 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -453,6 +453,8 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); +void mem_cgroup_destroy_cache(struct kmem_cache *cachep); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/include/linux/slab.h b/include/linux/slab.h index c0fcf28c15b2..869efb8d2377 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -11,6 +11,8 @@ #include #include +#include + /* * Flags to pass to kmem_cache_create(). @@ -179,7 +181,6 @@ void kmem_cache_free(struct kmem_cache *, void *); #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif - /* * This is the main placeholder for memcg-related information in kmem caches. * struct kmem_cache will hold a pointer to it, so the memory cost while @@ -197,6 +198,10 @@ void kmem_cache_free(struct kmem_cache *, void *); * @memcg: pointer to the memcg this cache belongs to * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from + * @dead: set to true after the memcg dies; the cache may still be around. + * @nr_pages: number of pages that belongs to this cache. + * @destroy: worker to be called whenever we are ready, or believe we may be + * ready, to destroy this cache. */ struct memcg_cache_params { bool is_root_cache; @@ -206,6 +211,9 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; + bool dead; + atomic_t nr_pages; + struct work_struct destroy; }; }; }; -- cgit v1.2.3 From 7cf2798240a2a2230cb16a391beef98d8a7ad362 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:55 -0800 Subject: memcg/sl[au]b: track all the memcg children of a kmem_cache This enables us to remove all the children of a kmem_cache being destroyed, if for example the kernel module it's being used in gets unloaded. Otherwise, the children will still point to the destroyed parent. 
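Roughly, the root-cache destruction path gains a hook like this sketch (kmem_cache_destroy_memcg_children() is declared in the hunk below; the rest of the teardown is elided):

	void kmem_cache_destroy(struct kmem_cache *s)
	{
		/* Take down all per-memcg copies before the root cache goes. */
		kmem_cache_destroy_memcg_children(s);

		/* ...usual reference drop and cache teardown follow here... */
	}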
Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 79fcf0cd7186..e119f3ef793c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -454,6 +454,7 @@ struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); void mem_cgroup_destroy_cache(struct kmem_cache *cachep); +void kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. @@ -601,6 +602,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { return cachep; } + +static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 749c54151a6e5b229e4ae067dbc651e54b161fbc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:01 -0800 Subject: memcg: aggregate memcg cache values in slabinfo When we create caches in memcgs, we need to display their usage information somewhere. We'll adopt a scheme similar to /proc/meminfo, with aggregate totals shown in the global file, and per-group information stored in the group itself. For the time being, only reads are allowed in the per-group cache. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++++++ include/linux/slab.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e119f3ef793c..8dc7c746b44f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -420,6 +420,11 @@ static inline void sock_release_memcg(struct sock *sk) #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; + +extern int memcg_limited_groups_array_size; +#define for_each_memcg_cache_index(_idx) \ + for ((_idx) = 0; i < memcg_limited_groups_array_size; (_idx)++) + static inline bool memcg_kmem_enabled(void) { return static_key_false(&memcg_kmem_enabled_key); @@ -557,6 +562,9 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) return __memcg_kmem_get_cache(cachep, gfp); } #else +#define for_each_memcg_cache_index(_idx) \ + for (; NULL; ) + static inline bool memcg_kmem_enabled(void) { return false; diff --git a/include/linux/slab.h b/include/linux/slab.h index 869efb8d2377..b9278663f22a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -220,6 +220,10 @@ struct memcg_cache_params { int memcg_update_all_caches(int num_memcgs); +struct seq_file; +int cache_show(struct kmem_cache *s, struct seq_file *m); +void print_slabinfo_header(struct seq_file *m); + /* * Common kmalloc functions provided by all allocators */ -- cgit v1.2.3 From 943a451a87d229ca564a27274b58eaeae35fde5d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:03 -0800 Subject: slab: 
propagate tunable values SLAB allows us to tune a particular cache behavior with tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This could be done by an explicit call to do_tune_cpucache() after the cache is created. But this is not very convenient now that the caches are created from common code, since this function is SLAB-specific. Another method of doing that is taking advantage of the fact that do_tune_cpucache() is always called from enable_cpucache(), which is called at cache initialization. We can just preset the values, and then things work as expected. It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. This change will require us to move the assignment of root_cache in memcg_params a bit earlier. We need this to be already set - which memcg_kmem_register_cache will do - when we reach __kmem_cache_create() Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +++++--- include/linux/slab.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8dc7c746b44f..ea02ff970836 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -448,7 +448,8 @@ void __memcg_kmem_commit_charge(struct page *page, void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache); void memcg_release_cache(struct kmem_cache *cachep); void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); @@ -590,8 +591,9 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } -static inline int memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) +static inline int +memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) { return 0; } diff --git a/include/linux/slab.h b/include/linux/slab.h index b9278663f22a..5d168d7e0a28 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -130,7 +130,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); struct kmem_cache * kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, - unsigned long, void (*)(void *)); + unsigned long, void (*)(void *), struct kmem_cache *); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); -- cgit v1.2.3 From 107dab5c92d5f9c3afe962036e47c207363255c7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:05 -0800 Subject: slub: slub-specific propagation changes SLUB allows us to tune a particular cache behavior with sysfs-based tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This can be done by tapping into the store attribute function provided by the allocator. 
We of course don't need to mess with read-only fields. Since the attributes can have multiple types and are stored internally by sysfs, the best strategy is to issue a ->show() in the root cache, and then ->store() in the memcg cache. The drawback is that sysfs can allocate up to a page of buffering for show(), which we are likely not to need, but also can't rule out. To avoid always allocating a page for that, we can update the caches at store time with the maximum attribute size ever stored to the root cache. We will then get a buffer big enough to hold it. The corollary to this is that if no stores happened, nothing will be propagated. It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. [akpm@linux-foundation.org: tweak code to avoid __maybe_unused] Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 364ba6c9fe21..9db4825cd393 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -103,6 +103,7 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG_KMEM struct memcg_cache_params *memcg_params; + int max_attr_size; /* for propagation, maximum size of a stored attr */ #endif #ifdef CONFIG_NUMA -- cgit v1.2.3 From ebe945c27628fca03723582eba138acc2e2f3d15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:10 -0800 Subject: memcg: add comments clarifying aspects of cache attribute propagation This patch clarifies two aspects of cache attribute propagation. First, the expected context for the for_each_memcg_cache_index macro in memcontrol.h. The usages already in the codebase are safe. In mm/slub.c, it is trivially safe because the lock is acquired right before the loop. In mm/slab.c, it is less so: the lock is acquired by an outer function a few steps back in the stack, so a VM_BUG_ON() is added to make sure it is indeed safe. A comment is also added to detail why we are returning the value of the parent cache and ignoring the children's when we propagate the attributes. Signed-off-by: Glauber Costa Cc: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea02ff970836..0108a56f814e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -422,6 +422,12 @@ static inline void sock_release_memcg(struct sock *sk) extern struct static_key memcg_kmem_enabled_key; extern int memcg_limited_groups_array_size; + +/* + * Helper macro to loop through all memcg-specific caches. Callers must still + * check if the cache is valid (it is either valid or NULL).
+ * the slab_mutex must be held when looping through those caches + */ #define for_each_memcg_cache_index(_idx) \ for ((_idx) = 0; i < memcg_limited_groups_array_size; (_idx)++) -- cgit v1.2.3 From 7179e7bf4592ac5a7b30257a7df6259ee81e51da Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 18 Dec 2012 14:23:19 -0800 Subject: mm/hugetlb: create hugetlb cgroup file in hugetlb_init Build the kernel with CONFIG_HUGETLBFS=y, CONFIG_HUGETLB_PAGE=y and CONFIG_CGROUP_HUGETLB=y, then specify the hugepagesz=xx boot option, and the system will fail to boot. This failure is caused by the following code path: setup_hugepagesz hugetlb_add_hstate hugetlb_cgroup_file_init cgroup_add_cftypes kzalloc <--slab is *not available* yet For this path, slab is not available yet, so the memory allocation will fail and trigger the WARN_ON() in hugetlb_cgroup_file_init(). So I moved hugetlb_cgroup_file_init() into hugetlb_init(). [akpm@linux-foundation.org: tweak coding-style, remove pointless __init on inlined function] [akpm@linux-foundation.org: fix warning] Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reviewed-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index d73878c694b3..ce8217f7b5c2 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); -extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage); @@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static inline int __init hugetlb_cgroup_file_init(int idx) +static inline void hugetlb_cgroup_file_init(void) { - return 0; } static inline void hugetlb_cgroup_migrate(struct page *oldhpage, -- cgit v1.2.3
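The fix amounts to moving the call site, roughly as in this sketch of mm/hugetlb.c after the patch (surrounding initialization elided):

	static int __init hugetlb_init(void)
	{
		/* ...hstate setup, sysfs registration, etc... */

		/*
		 * Safe here: hugetlb_init() runs as a regular initcall, well
		 * after the slab allocator is up, unlike setup_hugepagesz(),
		 * which runs while boot options are parsed.
		 */
		hugetlb_cgroup_file_init();

		return 0;
	}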