From 0df9d41ab5d43dc5b20abc8b22a6b6d098b03994 Mon Sep 17 00:00:00 2001 From: Yigal Korman Date: Mon, 16 Nov 2015 14:09:15 +0200 Subject: mm, dax: fix DAX deadlocks (COW fault) DAX handling of COW faults has wrong locking sequence: dax_fault does i_mmap_lock_read do_cow_fault does i_mmap_unlock_write Ross's commit[1] missed a fix[2] that Kirill added to Matthew's commit[3]. Original COW locking logic was introduced by Matthew here[4]. This should be applied to v4.3 as well. [1] 0f90cc6609c7 mm, dax: fix DAX deadlocks [2] 52a2b53ffde6 mm, dax: use i_mmap_unlock_write() in do_cow_fault() [3] 843172978bb9 dax: fix race between simultaneous faults [4] 2e4cdab0584f mm: allow page fault handlers to perform the COW Cc: Cc: Boaz Harrosh Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Acked-by: Ross Zwisler Signed-off-by: Yigal Korman Signed-off-by: Dan Williams --- mm/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index deb679c31f2a..c387430f06c3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3015,9 +3015,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. + * i_mmap_lock for read to protect against truncate. */ - i_mmap_unlock_write(vma->vm_file->f_mapping); + i_mmap_unlock_read(vma->vm_file->f_mapping); } goto uncharge_out; } @@ -3031,9 +3031,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. + * i_mmap_lock for read to protect against truncate. */ - i_mmap_unlock_write(vma->vm_file->f_mapping); + i_mmap_unlock_read(vma->vm_file->f_mapping); } return ret; uncharge_out: -- cgit v1.2.3 From 7511c3ede752e6dd67df20779b4e11effe102637 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Fri, 20 Nov 2015 15:57:02 -0800 Subject: mm: vmalloc: don't remove inexistent guard hole in remove_vm_area() Commit 71394fe50146 ("mm: vmalloc: add flag preventing guard hole allocation") missed a spot. Currently remove_vm_area() decreases vm->size to "remove" the guard hole page, even when it isn't present. All but one users just free the vm_struct rigth away and never access vm->size anyway. Don't touch the size in remove_vm_area() and have __vunmap() use the proper get_vm_area_size() helper. Signed-off-by: Jerome Marchand Acked-by: Andrey Ryabinin Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d04563480c94..8e3c9c5a3042 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1443,7 +1443,6 @@ struct vm_struct *remove_vm_area(const void *addr) vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; return vm; } @@ -1468,8 +1467,8 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, area->size); - debug_check_no_obj_freed(addr, area->size); + debug_check_no_locks_freed(addr, get_vm_area_size(area)); + debug_check_no_obj_freed(addr, get_vm_area_size(area)); if (deallocate_pages) { int i; -- cgit v1.2.3 From 1a763615688b891246c5b0a932d7a95fea4c1a68 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Fri, 20 Nov 2015 15:57:04 -0800 Subject: mm: loosen MADV_NOHUGEPAGE to enable Qemu postcopy on s390 MADV_NOHUGEPAGE processing is too restrictive. kvm already disables hugepage but hugepage_madvise() takes the error path when we ask to turn on the MADV_NOHUGEPAGE bit and the bit is already on. This causes Qemu's new postcopy migration feature to fail on s390 because its first action is to madvise the guest address space as NOHUGEPAGE. This patch modifies the code so that the operation succeeds without error now. For consistency reasons do the same for MADV_HUGEPAGE. Signed-off-by: Jason J. Herne Reviewed-by: Andrea Arcangeli Acked-by: Christian Borntraeger Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c29ddebc8705..62fe06bb7d04 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2009,7 +2009,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; @@ -2025,7 +2025,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; -- cgit v1.2.3 From 50e55bf626ad3ebbca45c0c0d03eb1710a139638 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 20 Nov 2015 15:57:10 -0800 Subject: mm/page-writeback.c: initialize m_dirty to avoid compile warning When building kernel with gcc 5.2, the below warning is raised: mm/page-writeback.c: In function 'balance_dirty_pages.isra.10': mm/page-writeback.c:1545:17: warning: 'm_dirty' may be used uninitialized in this function [-Wmaybe-uninitialized] unsigned long m_dirty, m_thresh, m_bg_thresh; The m_dirty{thresh, bg_thresh} are initialized in the block of "if (mdtc)", so if mdts is null, they won't be initialized before being used. Initialize m_dirty to zero, also initialize m_thresh and m_bg_thresh to keep consistency. They are used later by if condition: !mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh) If mdtc is null, dirty_freerun_ceiling will not be called at all, so the initialization will not change any behavior other than just ceasing the compile warning. (akpm: the patch actually reduces .text size by ~20 bytes on gcc-4.x.y) [akpm@linux-foundation.org: add comment] Signed-off-by: Yang Shi Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c90357c34ea..3e4d65445fa7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1542,7 +1542,9 @@ static void balance_dirty_pages(struct address_space *mapping, for (;;) { unsigned long now = jiffies; unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty, m_thresh, m_bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; /* * Unstable writes are a feature of certain networked -- cgit v1.2.3 From 459372545c9c0d6f491e280dccc8a54a61b60e56 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 Nov 2015 15:57:18 -0800 Subject: kasan: fix kmemleak false-positive in kasan_module_alloc() Kmemleak reports the following leak: unreferenced object 0xfffffbfff41ea000 (size 20480): comm "modprobe", pid 65199, jiffies 4298875551 (age 542.568s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xc0 [] __vmalloc_node_range+0x4b8/0x740 [] kasan_module_alloc+0x72/0xc0 [] module_alloc+0x78/0xb0 [] module_alloc_update_bounds+0x14/0x70 [] layout_and_allocate+0x16f4/0x3c90 [] load_module+0x2ff/0x6690 [] SyS_finit_module+0x136/0x170 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff kasan_module_alloc() allocates shadow memory for module and frees it on module unloading. It doesn't store the pointer to allocated shadow memory because it could be calculated from the shadowed address, i.e. kasan_mem_to_shadow(addr). Since kmemleak cannot find pointer to allocated shadow, it thinks that memory leaked. Use kmemleak_ignore() to tell kmemleak that this is not a leak and shadow memory doesn't contain any pointers. Signed-off-by: Andrey Ryabinin Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index d41b21bce6a0..bc0a8d8b8f42 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -444,6 +445,7 @@ int kasan_module_alloc(void *addr, size_t size) if (ret) { find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); return 0; } -- cgit v1.2.3 From a380a3c75529a5c42b78c0d64a46404f8cb0c0d1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 20 Nov 2015 15:57:35 -0800 Subject: slub: create new ___slab_alloc function that can be called with irqs disabled Bulk alloc needs a function like that because it enables interrupts before calling __slab_alloc which promptly disables them again using the expensive local_irq_save(). Signed-off-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7cb4bf9ae320..2a952751bb50 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2295,23 +2295,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * And if we were unable to get a new slab from the partial slab lists then * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that interrupts are + * already disabled (which is the case for bulk allocation). */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; struct page *page; - unsigned long flags; - - local_irq_save(flags); -#ifdef CONFIG_PREEMPT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area - * pointer. - */ - c = this_cpu_ptr(s->cpu_slab); -#endif page = c->page; if (!page) @@ -2369,7 +2361,6 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); return freelist; new_slab: @@ -2386,7 +2377,6 @@ new_slab: if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); return NULL; } @@ -2402,10 +2392,34 @@ new_slab: deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); return freelist; } +/* + * Another one that disabled interrupt and compensates for possible + * cpu changes by refetching the per cpu area pointer. + */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) +{ + void *p; + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif + + p = ___slab_alloc(s, gfpflags, node, addr, c); + local_irq_restore(flags); + return p; +} + /* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call -- cgit v1.2.3 From 87098373e244840e00bd1c93884c1d917411597e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 20 Nov 2015 15:57:38 -0800 Subject: slub: avoid irqoff/on in bulk allocation Use the new function that can do allocation while interrupts are disabled. Avoids irq on/off sequences. Signed-off-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2a952751bb50..23f9d8d26422 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2818,30 +2818,23 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void *object = c->freelist; if (unlikely(!object)) { - local_irq_enable(); /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ - p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, c); - if (unlikely(!p[i])) { - __kmem_cache_free_bulk(s, i, p); - return false; - } - local_irq_disable(); + if (unlikely(!p[i])) + goto error; + c = this_cpu_ptr(s->cpu_slab); continue; /* goto for-loop */ } /* kmem_cache debug support */ s = slab_pre_alloc_hook(s, flags); - if (unlikely(!s)) { - __kmem_cache_free_bulk(s, i, p); - c->tid = next_tid(c->tid); - local_irq_enable(); - return false; - } + if (unlikely(!s)) + goto error; c->freelist = get_freepointer(s, object); p[i] = object; @@ -2861,6 +2854,11 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, } return true; + +error: + __kmem_cache_free_bulk(s, i, p); + local_irq_enable(); + return false; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3 From b4a64718797b84b64a6ddf3d4183c29c2e79ef1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:41 -0800 Subject: slub: mark the dangling ifdef #else of CONFIG_SLUB_DEBUG The #ifdef of CONFIG_SLUB_DEBUG is located very far from the associated #else. For readability mark it with a comment. Signed-off-by: Jesper Dangaard Brouer Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 23f9d8d26422..a0c1365f6426 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1204,7 +1204,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, return flags; } -#else +#else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} -- cgit v1.2.3 From 81084651d73737988355f167065fab8a73574db1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:46 -0800 Subject: slub: support for bulk free with SLUB freelists Make it possible to free a freelist with several objects by adjusting API of slab_free() and __slab_free() to have head, tail and an objects counter (cnt). Tail being NULL indicate single object free of head object. This allow compiler inline constant propagation in slab_free() and slab_free_freelist_hook() to avoid adding any overhead in case of single object free. This allows a freelist with several objects (all within the same slab-page) to be free'ed using a single locked cmpxchg_double in __slab_free() and with an unlocked cmpxchg_double in slab_free(). Object debugging on the free path is also extended to handle these freelists. When CONFIG_SLUB_DEBUG is enabled it will also detect if objects don't belong to the same slab-page. These changes are needed for the next patch to bulk free the detached freelists it introduces and constructs. Micro benchmarking showed no performance reduction due to this change, when debugging is turned off (compiled with CONFIG_SLUB_DEBUG). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexander Duyck Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a0c1365f6426..d52f0d0ab712 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1065,11 +1065,15 @@ bad: return 0; } +/* Supports checking bulk free of a constructed freelist */ static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); @@ -1077,6 +1081,9 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_slab(s, page)) goto fail; +next_object: + cnt++; + if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); goto fail; @@ -1107,8 +1114,19 @@ static noinline struct kmem_cache_node *free_debug_processing( if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } out: + if (cnt != bulk_cnt) + slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); + slab_unlock(page); /* * Keep node_lock to preserve integrity @@ -1212,7 +1230,8 @@ static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } static inline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) @@ -1308,6 +1327,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static inline void slab_free_freelist_hook(struct kmem_cache *s, + void *head, void *tail) +{ +/* + * Compiler cannot detect this function can be removed if slab_free_hook() + * evaluates to nothing. Thus, catch all relevant config debug options here. + */ +#if defined(CONFIG_KMEMCHECK) || \ + defined(CONFIG_LOCKDEP) || \ + defined(CONFIG_DEBUG_KMEMLEAK) || \ + defined(CONFIG_DEBUG_OBJECTS_FREE) || \ + defined(CONFIG_KASAN) + + void *object = head; + void *tail_obj = tail ? : head; + + do { + slab_free_hook(s, object); + } while ((object != tail_obj) && + (object = get_freepointer(s, object))); +#endif +} + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -2583,10 +2625,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr) + void *head, void *tail, int cnt, + unsigned long addr) + { void *prior; - void **object = (void *)x; int was_frozen; struct page new; unsigned long counters; @@ -2596,7 +2639,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, x, addr, &flags))) + !(n = free_debug_processing(s, page, head, tail, cnt, + addr, &flags))) return; do { @@ -2606,10 +2650,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } prior = page->freelist; counters = page->counters; - set_freepointer(s, object, prior); + set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; - new.inuse--; + new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { if (kmem_cache_has_cpu_partial(s) && !prior) { @@ -2640,7 +2684,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } while (!cmpxchg_double_slab(s, page, prior, counters, - object, new.counters, + head, new.counters, "__slab_free")); if (likely(!n)) { @@ -2705,15 +2749,20 @@ slab_empty: * * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. + * + * Bulk free of a freelist with several objects (all pointing to the + * same page) possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, unsigned long addr) +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) { - void **object = (void *)x; + void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - slab_free_hook(s, x); + slab_free_freelist_hook(s, head, tail); redo: /* @@ -2732,19 +2781,19 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, object, c->freelist); + set_freepointer(s, tail_obj, c->freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, c->freelist, tid, - object, next_tid(tid)))) { + head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr); + __slab_free(s, page, head, tail_obj, cnt, addr); } @@ -2753,7 +2802,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, _RET_IP_); + slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2788,7 +2837,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) c->tid = next_tid(c->tid); local_irq_enable(); /* Slowpath: overhead locked cmpxchg_double_slab */ - __slab_free(s, page, object, _RET_IP_); + __slab_free(s, page, object, object, 1, _RET_IP_); local_irq_disable(); c = this_cpu_ptr(s->cpu_slab); } @@ -3523,7 +3572,7 @@ void kfree(const void *x) __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab_cache, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From d0ecd894e3d5f768a84403b34019c4a7daa05882 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:49 -0800 Subject: slub: optimize bulk slowpath free by detached freelist This change focus on improving the speed of object freeing in the "slowpath" of kmem_cache_free_bulk. The calls slab_free (fastpath) and __slab_free (slowpath) have been extended with support for bulk free, which amortize the overhead of the (locked) cmpxchg_double. To use the new bulking feature, we build what I call a detached freelist. The detached freelist takes advantage of three properties: 1) the free function call owns the object that is about to be freed, thus writing into this memory is synchronization-free. 2) many freelist's can co-exist side-by-side in the same slab-page each with a separate head pointer. 3) it is the visibility of the head pointer that needs synchronization. Given these properties, the brilliant part is that the detached freelist can be constructed without any need for synchronization. The freelist is constructed directly in the page objects, without any synchronization needed. The detached freelist is allocated on the stack of the function call kmem_cache_free_bulk. Thus, the freelist head pointer is not visible to other CPUs. All objects in a SLUB freelist must belong to the same slab-page. Thus, constructing the detached freelist is about matching objects that belong to the same slab-page. The bulk free array is scanned is a progressive manor with a limited look-ahead facility. Kmem debug support is handled in call of slab_free(). Notice kmem_cache_free_bulk no longer need to disable IRQs. This only slowed down single free bulk with approx 3 cycles. Performance data: Benchmarked[1] obj size 256 bytes on CPU i7-4790K @ 4.00GHz SLUB fastpath single object quick reuse: 47 cycles(tsc) 11.931 ns To get stable and comparable numbers, the kernel have been booted with "slab_merge" (this also improve performance for larger bulk sizes). Performance data, compared against fallback bulking: bulk - fallback bulk - improvement with this patch 1 - 62 cycles(tsc) 15.662 ns - 49 cycles(tsc) 12.407 ns- improved 21.0% 2 - 55 cycles(tsc) 13.935 ns - 30 cycles(tsc) 7.506 ns - improved 45.5% 3 - 53 cycles(tsc) 13.341 ns - 23 cycles(tsc) 5.865 ns - improved 56.6% 4 - 52 cycles(tsc) 13.081 ns - 20 cycles(tsc) 5.048 ns - improved 61.5% 8 - 50 cycles(tsc) 12.627 ns - 18 cycles(tsc) 4.659 ns - improved 64.0% 16 - 49 cycles(tsc) 12.412 ns - 17 cycles(tsc) 4.495 ns - improved 65.3% 30 - 49 cycles(tsc) 12.484 ns - 18 cycles(tsc) 4.533 ns - improved 63.3% 32 - 50 cycles(tsc) 12.627 ns - 18 cycles(tsc) 4.707 ns - improved 64.0% 34 - 96 cycles(tsc) 24.243 ns - 23 cycles(tsc) 5.976 ns - improved 76.0% 48 - 83 cycles(tsc) 20.818 ns - 21 cycles(tsc) 5.329 ns - improved 74.7% 64 - 74 cycles(tsc) 18.700 ns - 20 cycles(tsc) 5.127 ns - improved 73.0% 128 - 90 cycles(tsc) 22.734 ns - 27 cycles(tsc) 6.833 ns - improved 70.0% 158 - 99 cycles(tsc) 24.776 ns - 30 cycles(tsc) 7.583 ns - improved 69.7% 250 - 104 cycles(tsc) 26.089 ns - 37 cycles(tsc) 9.280 ns - improved 64.4% Performance data, compared current in-kernel bulking: bulk - curr in-kernel - improvement with this patch 1 - 46 cycles(tsc) - 49 cycles(tsc) - improved (cycles:-3) -6.5% 2 - 27 cycles(tsc) - 30 cycles(tsc) - improved (cycles:-3) -11.1% 3 - 21 cycles(tsc) - 23 cycles(tsc) - improved (cycles:-2) -9.5% 4 - 18 cycles(tsc) - 20 cycles(tsc) - improved (cycles:-2) -11.1% 8 - 17 cycles(tsc) - 18 cycles(tsc) - improved (cycles:-1) -5.9% 16 - 18 cycles(tsc) - 17 cycles(tsc) - improved (cycles: 1) 5.6% 30 - 18 cycles(tsc) - 18 cycles(tsc) - improved (cycles: 0) 0.0% 32 - 18 cycles(tsc) - 18 cycles(tsc) - improved (cycles: 0) 0.0% 34 - 78 cycles(tsc) - 23 cycles(tsc) - improved (cycles:55) 70.5% 48 - 60 cycles(tsc) - 21 cycles(tsc) - improved (cycles:39) 65.0% 64 - 49 cycles(tsc) - 20 cycles(tsc) - improved (cycles:29) 59.2% 128 - 69 cycles(tsc) - 27 cycles(tsc) - improved (cycles:42) 60.9% 158 - 79 cycles(tsc) - 30 cycles(tsc) - improved (cycles:49) 62.0% 250 - 86 cycles(tsc) - 37 cycles(tsc) - improved (cycles:49) 57.0% Performance with normal SLUB merging is significantly slower for larger bulking. This is believed to (primarily) be an effect of not having to share the per-CPU data-structures, as tuning per-CPU size can achieve similar performance. bulk - slab_nomerge - normal SLUB merge 1 - 49 cycles(tsc) - 49 cycles(tsc) - merge slower with cycles:0 2 - 30 cycles(tsc) - 30 cycles(tsc) - merge slower with cycles:0 3 - 23 cycles(tsc) - 23 cycles(tsc) - merge slower with cycles:0 4 - 20 cycles(tsc) - 20 cycles(tsc) - merge slower with cycles:0 8 - 18 cycles(tsc) - 18 cycles(tsc) - merge slower with cycles:0 16 - 17 cycles(tsc) - 17 cycles(tsc) - merge slower with cycles:0 30 - 18 cycles(tsc) - 23 cycles(tsc) - merge slower with cycles:5 32 - 18 cycles(tsc) - 22 cycles(tsc) - merge slower with cycles:4 34 - 23 cycles(tsc) - 22 cycles(tsc) - merge slower with cycles:-1 48 - 21 cycles(tsc) - 22 cycles(tsc) - merge slower with cycles:1 64 - 20 cycles(tsc) - 48 cycles(tsc) - merge slower with cycles:28 128 - 27 cycles(tsc) - 57 cycles(tsc) - merge slower with cycles:30 158 - 30 cycles(tsc) - 59 cycles(tsc) - merge slower with cycles:29 250 - 37 cycles(tsc) - 56 cycles(tsc) - merge slower with cycles:19 Joint work with Alexander Duyck. [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/slab_bulk_test01.c [akpm@linux-foundation.org: BUG_ON -> WARN_ON;return] Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexander Duyck Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 109 +++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d52f0d0ab712..c17c5202864d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2807,44 +2807,93 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -/* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) -{ - struct kmem_cache_cpu *c; +struct detached_freelist { struct page *page; - int i; + void *tail; + void *freelist; + int cnt; +}; - local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + * page. It builds a detached freelist directly within the given + * page/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons. + */ +static int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) +{ + size_t first_skipped_index = 0; + int lookahead = 3; + void *object; - for (i = 0; i < size; i++) { - void *object = p[i]; + /* Always re-init detached_freelist */ + df->page = NULL; - BUG_ON(!object); - /* kmem cache debug support */ - s = cache_from_obj(s, object); - if (unlikely(!s)) - goto exit; - slab_free_hook(s, object); + do { + object = p[--size]; + } while (!object && size); - page = virt_to_head_page(object); + if (!object) + return 0; - if (c->page == page) { - /* Fastpath: local CPU free */ - set_freepointer(s, object, c->freelist); - c->freelist = object; - } else { - c->tid = next_tid(c->tid); - local_irq_enable(); - /* Slowpath: overhead locked cmpxchg_double_slab */ - __slab_free(s, page, object, object, 1, _RET_IP_); - local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); + /* Start new detached freelist */ + set_freepointer(s, object, NULL); + df->page = virt_to_head_page(object); + df->tail = object; + df->freelist = object; + p[size] = NULL; /* mark object processed */ + df->cnt = 1; + + while (size) { + object = p[--size]; + if (!object) + continue; /* Skip processed objects */ + + /* df->page is always set at this point */ + if (df->page == virt_to_head_page(object)) { + /* Opportunity build freelist */ + set_freepointer(s, object, df->freelist); + df->freelist = object; + df->cnt++; + p[size] = NULL; /* mark object processed */ + + continue; } + + /* Limit look ahead search */ + if (!--lookahead) + break; + + if (!first_skipped_index) + first_skipped_index = size + 1; } -exit: - c->tid = next_tid(c->tid); - local_irq_enable(); + + return first_skipped_index; +} + + +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (WARN_ON(!size)) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (unlikely(!df.page)) + continue; + + slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); -- cgit v1.2.3 From 03ec0ed57ffc77720b811dbb6d44733b58360d9f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:52 -0800 Subject: slub: fix kmem cgroup bug in kmem_cache_alloc_bulk The call slab_pre_alloc_hook() interacts with kmemgc and is not allowed to be called several times inside the bulk alloc for loop, due to the call to memcg_kmem_get_cache(). This would result in hitting the VM_BUG_ON in __memcg_kmem_get_cache. As suggested by Vladimir Davydov, change slab_post_alloc_hook() to be able to handle an array of objects. A subtle detail is, loop iterator "i" in slab_post_alloc_hook() must have same type (size_t) as size argument. This helps the compiler to easier realize that it can remove the loop, when all debug statements inside loop evaluates to nothing. Note, this is only an issue because the kernel is compiled with GCC option: -fno-strict-overflow In slab_alloc_node() the compiler inlines and optimizes the invocation of slab_post_alloc_hook(s, flags, 1, &object) by removing the loop and access object directly. Signed-off-by: Jesper Dangaard Brouer Reported-by: Vladimir Davydov Suggested-by: Vladimir Davydov Reviewed-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c17c5202864d..ce1797623391 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1292,14 +1292,21 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, return memcg_kmem_get_cache(s, flags); } -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) { + size_t i; + flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } memcg_kmem_put_cache(s); - kasan_slab_alloc(s, object); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -2475,7 +2482,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { - void **object; + void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; @@ -2554,7 +2561,7 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); - slab_post_alloc_hook(s, gfpflags, object); + slab_post_alloc_hook(s, gfpflags, 1, &object); return object; } @@ -2904,6 +2911,10 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, struct kmem_cache_cpu *c; int i; + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -2928,17 +2939,8 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); continue; /* goto for-loop */ } - - /* kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); - if (unlikely(!s)) - goto error; - c->freelist = get_freepointer(s, object); p[i] = object; - - /* kmem_cache debug support */ - slab_post_alloc_hook(s, flags, object); } c->tid = next_tid(c->tid); local_irq_enable(); @@ -2951,11 +2953,13 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, memset(p[j], 0, s->object_size); } + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); return true; - error: - __kmem_cache_free_bulk(s, i, p); local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); return false; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3 From 033745189b1bae3fc931beeaf48604ee7c259309 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:55 -0800 Subject: slub: add missing kmem cgroup support to kmem_cache_free_bulk Initial implementation missed support for kmem cgroup support in kmem_cache_free_bulk() call, add this. If CONFIG_MEMCG_KMEM is not enabled, the compiler should be smart enough to not add any asm code. Incoming bulk free objects can belong to different kmem cgroups, and object free call can happen at a later point outside memcg context. Thus, we need to keep the orig kmem_cache, to correctly verify if a memcg object match against its "root_cache" (s->memcg_params.root_cache). Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ce1797623391..34847044dfe5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2887,13 +2887,17 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; + struct kmem_cache *s; + + /* Support for memcg */ + s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) -- cgit v1.2.3 From 865762a8119e74b5f0e236d2d8eaaf8be9292a06 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:58 -0800 Subject: slab/slub: adjust kmem_cache_alloc_bulk API Adjust kmem_cache_alloc_bulk API before we have any real users. Adjust API to return type 'int' instead of previously type 'bool'. This is done to allow future extension of the bulk alloc API. A future extension could be to allow SLUB to stop at a page boundary, when specified by a flag, and then return the number of objects. The advantage of this approach, would make it easier to make bulk alloc run without local IRQs disabled. With an approach of cmpxchg "stealing" the entire c->freelist or page->freelist. To avoid overshooting we would stop processing at a slab-page boundary. Else we always end up returning some objects at the cost of another cmpxchg. To keep compatible with future users of this API linking against an older kernel when using the new flag, we need to return the number of allocated objects with this API change. Signed-off-by: Jesper Dangaard Brouer Cc: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 6 +++--- mm/slob.c | 2 +- mm/slub.c | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e0819fa96559..4765c97ce690 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3419,7 +3419,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { return __kmem_cache_alloc_bulk(s, flags, size, p); diff --git a/mm/slab.h b/mm/slab.h index 27492eb678f7..7b6087197997 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -170,7 +170,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_MEMCG_KMEM /* diff --git a/mm/slab_common.c b/mm/slab_common.c index d88e97c10a2e..3c6a86b4ec25 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -112,7 +112,7 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) kmem_cache_free(s, p[i]); } -bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void **p) { size_t i; @@ -121,10 +121,10 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void *x = p[i] = kmem_cache_alloc(s, flags); if (!x) { __kmem_cache_free_bulk(s, i, p); - return false; + return 0; } } - return true; + return i; } #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/slob.c b/mm/slob.c index 0d7e5df74d1f..17e8f8cc7c53 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -617,7 +617,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { return __kmem_cache_alloc_bulk(s, flags, size, p); diff --git a/mm/slub.c b/mm/slub.c index 34847044dfe5..46997517406e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2909,8 +2909,8 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); /* Note that interrupts must be enabled when calling this function. */ -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; int i; @@ -2959,12 +2959,12 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, /* memcg and kmem_cache debug support */ slab_post_alloc_hook(s, flags, size, p); - return true; + return i; error: local_irq_enable(); slab_post_alloc_hook(s, flags, i, p); __kmem_cache_free_bulk(s, i, p); - return false; + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3