From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 6774e6b2e96d..51915842f1c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -1848,16 +1848,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1867,8 +1870,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } -- cgit v1.2.3 From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. - RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 51915842f1c0..2e391c754ae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; -- cgit v1.2.3