From c3cddc4c296c3a1498df7181015cbcea178a0546 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 24 Jun 2015 16:54:45 -0700 Subject: fsnotify: remove obsolete documentation should_send_event is no longer part of struct fsnotify_ops, so remove it. Signed-off-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify_backend.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0f313f93c586..65a517dd32f7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,8 +84,6 @@ struct fsnotify_fname; * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * - * should_send_event - given a group, inode, and mask this function determines - * if the group is interested in this event. * handle_event - main call for a group to handle an fs event * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group -- cgit v1.2.3 From 5286d20c4eb7b0c29217f8756652609df74f5489 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 24 Jun 2015 16:54:51 -0700 Subject: configfs: unexport/make static config_item_init() config_item_init() is only used in item.c Signed-off-by: Fabian Frederick Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/configfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) return item->ci_name; } -extern void config_item_init(struct config_item *); extern void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type); -- cgit v1.2.3 From b5242e98c1cb834feb1e84026f09a4796b49eb4d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:42 -0700 Subject: smpboot: allow excluding cpus from the smpboot threads This patch series allows the watchdog to run by default only on the housekeeping cores when nohz_full is in effect; this seems to be a good compromise short of turning it off completely (since the nohz_full cores can't tolerate a watchdog). To provide customizability, we add /proc/sys/kernel/watchdog_cpumask so that the set of cores running the watchdog can be tuned to different values after bootup. To implement this customizability, we add a new smpboot_update_cpumask_percpu_thread() API to the smpboot_thread subsystem that lets us park or unpark "unwanted" threads. And now that threads can be parked for long periods of time, we tweak the /proc//stat and /proc//status code so parked threads aren't reported as running, which is otherwise confusing. This patch (of 3): This change allows some cores to be excluded from running the smp_hotplug_thread tasks. The following commit to update kernel/watchdog.c to use this functionality is the motivating example, and more information on the motivation is provided there. A new smp_hotplug_thread field is introduced, "cpumask", which is cpumask field managed by the smpboot subsystem that indicates whether or not the given smp_hotplug_thread should run on that core; the cpumask is checked when deciding whether to unpark the thread. To limit the cpumask to less than cpu_possible, you must call smpboot_update_cpumask_percpu_thread() after registering. Signed-off-by: Chris Metcalf Cc: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smpboot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -27,6 +27,8 @@ struct smpboot_thread_data; * @pre_unpark: Optional unpark function, called before the thread is * unparked (cpu online). This is not guaranteed to be * called on the target cpu of the thread. Careful! + * @cpumask: Internal state. To update which threads are unparked, + * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -41,11 +43,14 @@ struct smp_hotplug_thread { void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); void (*pre_unpark)(unsigned int cpu); + cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif -- cgit v1.2.3 From fe4ba3c34352b7e8068b7f18eb233444aed17011 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:45 -0700 Subject: watchdog: add watchdog_cpumask sysctl to assist nohz Change the default behavior of watchdog so it only runs on the housekeeping cores when nohz_full is enabled at build and boot time. Allow modifying the set of cores the watchdog is currently running on with a new kernel.watchdog_cpumask sysctl. In the current system, the watchdog subsystem runs a periodic timer that schedules the watchdog kthread to run. However, nohz_full cores are designed to allow userspace application code running on those cores to have 100% access to the CPU. So the watchdog system prevents the nohz_full application code from being able to run the way it wants to, thus the motivation to suppress the watchdog on nohz_full cores, which this patchset provides by default. However, if we disable the watchdog globally, then the housekeeping cores can't benefit from the watchdog functionality. So we allow disabling it only on some cores. See Documentation/lockup-watchdogs.txt for more information. [jhubbard@nvidia.com: fix a watchdog crash in some configurations] Signed-off-by: Chris Metcalf Acked-by: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); +extern int proc_watchdog_cpumask(struct ctl_table *, int, + void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI -- cgit v1.2.3 From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 24 Jun 2015 16:55:54 -0700 Subject: mm/slab_common: support the slub_debug boot option on specific object size The slub_debug=PU,kmalloc-xx cannot work because in the create_kmalloc_caches() the s->name is created after the create_kmalloc_cache() is called. The name is NULL in the create_kmalloc_cache() so the kmem_cache_flags() would not set the slub_debug flags to the s->flags. The fix here set up a kmalloc_names string array for the initialization purpose and delete the dynamic name creation of kmalloc_caches. [akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text] Signed-off-by: Gavin Guo Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..96f0ea506b5c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,8 +153,30 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +/* + * The KMALLOC_LOOP_LOW is the definition for the for loop index start number + * to create the kmalloc_caches object in create_kmalloc_caches(). The first + * and the second are 96 and 192. You can see that in the kmalloc_index(), if + * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, + * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't + * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. + */ +#if KMALLOC_MIN_SIZE <= 32 +#define KMALLOC_LOOP_LOW 1 +#elif KMALLOC_MIN_SIZE <= 64 +#define KMALLOC_LOOP_LOW 2 +#else +#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW +#endif + #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +/* + * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. + * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be + * initialized. + */ +#define KMALLOC_LOOP_LOW 1 #endif /* -- cgit v1.2.3 From 1ed58b6051b67e5cfe9e465fb60bf7d5f55e0a64 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:55:59 -0700 Subject: linux/slab.h: fix three off-by-one typos in comment The first is a keyboard-off-by-one, the other two the ordinary mathy kind. Signed-off-by: Rasmus Villemoes Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 96f0ea506b5c..9de2fdc8b5e4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes - * 2 = 120 .. 192 bytes - * n = 2^(n-1) .. 2^n -1 + * 2 = 129 .. 192 bytes + * n = 2^(n-1)+1 .. 2^n */ static __always_inline int kmalloc_index(size_t size) { -- cgit v1.2.3 From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:16 -0700 Subject: mm: new mm hook framework CRIU is recreating the process memory layout by remapping the checkpointee memory area on top of the current process (criu). This includes remapping the vDSO to the place it has at checkpoint time. However some architectures like powerpc are keeping a reference to the vDSO base address to build the signal return stack frame by calling the vDSO sigreturn service. So once the vDSO has been moved, this reference is no more valid and the signal frame built later are not usable. This patch serie is introducing a new mm hook framework, and a new arch_remap hook which is called when mremap is done and the mm lock still hold. The next patch is adding the vDSO remap and unmap tracking to the powerpc architecture. This patch (of 3): This patch introduces a new set of header file to manage mm hooks: - per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h) - a generic header (include/linux/mm-arch-hooks.h) The architecture which need to overwrite a hook as to redefine it in its header file, while architecture which doesn't need have nothing to do. The default hooks are defined in the generic header and are used in the case the architecture is not defining it. In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should be moved here. Signed-off-by: Laurent Dufour Suggested-by: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/mm-arch-hooks.h (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..63005e367abd --- /dev/null +++ b/include/linux/mm-arch-hooks.h @@ -0,0 +1,16 @@ +/* + * Generic mm no-op hooks. + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_MM_ARCH_HOOKS_H +#define _LINUX_MM_ARCH_HOOKS_H + +#include + +#endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:19 -0700 Subject: mm: new arch_remap() hook Some architectures would like to be triggered when a memory area is moved through the mremap system call. This patch introduces a new arch_remap() mm hook which is placed in the path of mremap, and is called before the old area is unmapped (and the arch_unmap() hook is called). Signed-off-by: Laurent Dufour Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h index 63005e367abd..4efc3f56e6df 100644 --- a/include/linux/mm-arch-hooks.h +++ b/include/linux/mm-arch-hooks.h @@ -13,4 +13,13 @@ #include +#ifndef arch_remap +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ +} +#define arch_remap arch_remap +#endif + #endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:56:28 -0700 Subject: mm: only define hashdist variable when needed For !CONFIG_NUMA, hashdist will always be 0, since it's setter is otherwise compiled out. So we can save 4 bytes of data and some .text (although mostly in __init functions) by only defining it for CONFIG_NUMA. Signed-off-by: Rasmus Villemoes Acked-by: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) -#define HASHDIST_DEFAULT 1 +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define HASHDIST_DEFAULT 0 +#define hashdist (0) #endif -extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif /* _LINUX_BOOTMEM_H */ -- cgit v1.2.3 From c761471b58e6138938ebc6eafec20b2f60cb3397 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jun 2015 16:56:33 -0700 Subject: mm: avoid tail page refcounting on non-THP compound pages Reintroduce 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP compound pages") after removing bogus VM_BUG_ON_PAGE() in put_unrefcounted_compound_page(). THP uses tail page refcounting to be able to split huge pages at any time. Tail page refcounting is not needed for other users of compound pages and it's harmful because of overhead. We try to exclude non-THP pages from tail page refcounting using __compound_tail_refcounted() check. It excludes most common non-THP compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP users -- drivers. And it's not only about overhead. Drivers might want to use compound pages to get refcounting semantics suitable for mapping high-order pages to userspace. But tail page refcounting breaks it. Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on them. It means GUP pins would affect page_mapcount() for tail pages. It's not a problem for THP, because it never maps tail pages. But unlike THP, drivers map parts of compound pages with PTEs and it makes page_mapcount() be called for tail pages. In particular, GUP pins would shift PSS up and affect /proc/kpagecount for such pages. But, I'm not aware about anything which can lead to crash or other serious misbehaviour. Since currently all THP pages are anonymous and all drivers pages are not, we can fix the __compound_tail_refcounted() check by requiring PageAnon() to enable tail page refcounting. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Reported-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -499,7 +499,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* -- cgit v1.2.3 From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:48 -0700 Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling memory_failure() can run in 2 different mode (specified by MF_COUNT_INCREASED) in page refcount perspective. When MF_COUNT_INCREASED is set, memory_failure() assumes that the caller takes a refcount of the target page. And if cleared, memory_failure() takes it in it's own. In current code, however, refcounting is done differently in each caller. For example, madvise_hwpoison() uses get_user_pages_fast() and hwpoison_inject() uses get_page_unless_zero(). So this inconsistent refcounting causes refcount failure especially for thp tail pages. Typical user visible effects are like memory leak or VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page(). To fix this refcounting issue, this patch introduces get_hwpoison_page() to handle thp tail pages in the same manner for each caller of hwpoison code. memory_failure() might fail to split thp and in such case it returns without completing page isolation. This is not good because PageHWPoison on the thp is still set and there's no easy way to unpoison such thps. So this patch try to roll back any action to the thp in "non anonymous thp" case and "thp split failed" case, expecting an MCE(SRAR) generated by later access afterward will properly free such thps. [akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m] Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b086070c3a5..cd9df66138a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2146,6 +2146,7 @@ enum mf_flags { extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); +extern int get_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:07 -0700 Subject: mm: oom_kill: clean up victim marking and exiting interfaces Rename unmark_oom_victim() to exit_oom_victim(). Marking and unmarking are related in functionality, but the interface is not symmetrical at all: one is an internal OOM killer function used during the killing, the other is for an OOM victim to signal its own death on exit later on. This has locking implications, see follow-up changes. While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which is easier on the eye. Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 44b2f6f7bbd8..a8e6a498cbcb 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,9 +47,7 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } -extern void mark_tsk_oom_victim(struct task_struct *tsk); - -extern void unmark_oom_victim(void); +extern void mark_oom_victim(struct task_struct *tsk); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, @@ -75,6 +73,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); + +extern void exit_oom_victim(void); + extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:19 -0700 Subject: mm: oom_kill: simplify OOM killer locking The zonelist locking and the oom_sem are two overlapping locks that are used to serialize global OOM killing against different things. The historical zonelist locking serializes OOM kills from allocations with overlapping zonelists against each other to prevent killing more tasks than necessary in the same memory domain. Only when neither tasklists nor zonelists from two concurrent OOM kills overlap (tasks in separate memcgs bound to separate nodes) are OOM kills allowed to execute in parallel. The younger oom_sem is a read-write lock to serialize OOM killing against the PM code trying to disable the OOM killer altogether. However, the OOM killer is a fairly cold error path, there is really no reason to optimize for highly performant and concurrent OOM kills. And the oom_sem is just flat-out redundant. Replace both locking schemes with a single global mutex serializing OOM kills regardless of context. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index a8e6a498cbcb..7deecb7bca5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -32,6 +32,8 @@ enum oom_scan_t { /* Thread is the potential origin of an oom condition; kill first on oom */ #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) +extern struct mutex oom_lock; + static inline void set_current_oom_origin(void) { current->signal->oom_flags |= OOM_FLAG_ORIGIN; @@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); -extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); - extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask, struct mem_cgroup *memcg); -- cgit v1.2.3 From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:30 -0700 Subject: memory-failure: export page_type and action result Export 'outcome' and 'action_page_type' to mm.h, so we could use this emnus outside. This patch is preparation for adding trace events for memory-failure recovery action. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cd9df66138a1..986a9a221ee0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2153,6 +2153,40 @@ extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages; extern int soft_offline_page(struct page *page, int flags); + +/* + * Error handlers for various types of pages. + */ +enum mf_outcome { + MF_IGNORED, /* Error: cannot be handled */ + MF_FAILED, /* Error: handling failed */ + MF_DELAYED, /* Will be handled later */ + MF_RECOVERED, /* Successfully recovered */ +}; + +enum mf_action_page_type { + MF_MSG_KERNEL, + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, + MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, + MF_MSG_DIRTY_MLOCKED_LRU, + MF_MSG_CLEAN_MLOCKED_LRU, + MF_MSG_DIRTY_UNEVICTABLE_LRU, + MF_MSG_CLEAN_UNEVICTABLE_LRU, + MF_MSG_DIRTY_LRU, + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, + MF_MSG_BUDDY_2ND, + MF_MSG_UNKNOWN, +}; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr, -- cgit v1.2.3 From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:33 -0700 Subject: memory-failure: change type of action_result's param 3 to enum Change type of action_result's param 3 to enum for type consistency, and rename mf_outcome to mf_result for clearly. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 986a9a221ee0..24ad583596d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2157,7 +2157,7 @@ extern int soft_offline_page(struct page *page, int flags); /* * Error handlers for various types of pages. */ -enum mf_outcome { +enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ -- cgit v1.2.3 From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:44 -0700 Subject: mm: clarify that the function operates on hugepage pte We have confusing functions to clear pmd, pmd_clear_* and pmd_clear. Add _huge_ to pmdp_clear functions so that we are clear that they operate on hugepage pte. We don't bother about other functions like pmdp_set_wrprotect, pmdp_clear_flush_young, because they operate on PTE bits and hence indicate they are operating on hugepage ptes Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 95243d28a0ee..61cd67f4d788 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pte; \ }) -#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ +#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ + ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) -#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ +#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ + ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ mmu_notifier_invalidate_range(__mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ @@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_get_and_clear_notify pmdp_get_and_clear +#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:09 -0700 Subject: mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute Some high end Intel Xeon systems report uncorrectable memory errors as a recoverable machine check. Linux has included code for some time to process these and just signal the affected processes (or even recover completely if the error was in a read only page that can be replaced by reading from disk). But we have no recovery path for errors encountered during kernel code execution. Except for some very specific cases were are unlikely to ever be able to recover. Enter memory mirroring. Actually 3rd generation of memory mirroing. Gen1: All memory is mirrored Pro: No s/w enabling - h/w just gets good data from other side of the mirror Con: Halves effective memory capacity available to OS/applications Gen2: Partial memory mirror - just mirror memory begind some memory controllers Pro: Keep more of the capacity Con: Nightmare to enable. Have to choose between allocating from mirrored memory for safety vs. NUMA local memory for performance Gen3: Address range partial memory mirror - some mirror on each memory controller Pro: Can tune the amount of mirror and keep NUMA performance Con: I have to write memory management code to implement The current plan is just to use mirrored memory for kernel allocations. This has been broken into two phases: 1) This patch series - find the mirrored memory, use it for boot time allocations 2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the unused mirrored memory from mm/memblock.c and only give it out to select kernel allocations (this is still being scoped because page_alloc.c is scary). This patch (of 3): Add extra "flags" to memblock to allow selection of memory based on attribute. No functional changes Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..7aeec0cb4c27 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -21,7 +21,10 @@ #define INIT_PHYSMEM_REGIONS 4 /* Definition of memblock flags. */ -#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ +enum { + MEMBLOCK_NONE = 0x0, /* No special request */ + MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ +}; struct memblock_region { phys_addr_t base; @@ -61,7 +64,7 @@ extern bool movable_node_enabled; phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - int nid); + int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size); -void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range_rev(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); @@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range(i, type_a, type_b, nid, \ +#define for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ - for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ + for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range(&i, nid, type_a, type_b, \ + __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** @@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range_rev(i, type_a, type_b, nid, \ +#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) #ifdef CONFIG_MOVABLE_NODE @@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock. Available as * soon as memblock is initialized. */ -#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas @@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock in reverse * order. Available as soon as memblock is initialized. */ -#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ + p_nid) \ for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) @@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; } #define MEMBLOCK_ALLOC_ACCESSIBLE 0 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end); + phys_addr_t start, phys_addr_t end, + ulong flags); phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, -- cgit v1.2.3 From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:12 -0700 Subject: mm/memblock: allocate boot time data structures from mirrored memory Try to allocate all boot time kernel data structures from mirrored memory. If we run out of mirrored memory print warnings, but fall back to using non-mirrored memory to make sure that we still boot. By number of bytes, most of what we allocate at boot time is the page structures. 64 bytes per 4K page on x86_64 ... or about 1.5% of total system memory. For workloads where the bulk of memory is allocated to applications this may represent a useful improvement to system availability since 1.5% of total memory might be a third of the memory allocated to the kernel. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 7aeec0cb4c27..0215ffd63069 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -24,6 +24,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ + MEMBLOCK_MIRROR = 0x2, /* mirrored region */ }; struct memblock_region { @@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); +int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +ulong choose_memblock_flags(void); /* Low level functions */ int memblock_add_range(struct memblock_type *type, @@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void) } #endif +static inline bool memblock_is_mirror(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_MIRROR; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); -- cgit v1.2.3 From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:15 -0700 Subject: x86, mirror: x86 enabling - find mirrored memory ranges UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory address ranges. See UEFI 2.5 spec pages 157-158: http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf On EFI enabled systems scan the memory map and tell memblock about any mirrored ranges. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2092965afca3..5f19efe4eb3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -96,6 +96,8 @@ typedef struct { #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 @@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos extern void efi_late_init(void); extern void efi_free_boot_services(void); extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); +extern void efi_find_mirror(void); #else static inline void efi_late_init(void) {} static inline void efi_free_boot_services(void) {} -- cgit v1.2.3 From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 24 Jun 2015 16:58:18 -0700 Subject: frontswap: allow multiple backends Change frontswap single pointer to a singly linked list of frontswap implementations. Update Xen tmem implementation as register no longer returns anything. Frontswap only keeps track of a single implementation; any implementation that registers second (or later) will replace the previously registered implementation, and gets a pointer to the previous implementation that the new implementation is expected to pass all frontswap functions to if it can't handle the function itself. However that method doesn't really make much sense, as passing that work on to every implementation adds unnecessary work to implementations; instead, frontswap should simply keep a list of all registered implementations and try each implementation for any function. Most importantly, neither of the two currently existing frontswap implementations in the kernel actually do anything with any previous frontswap implementation that they replace when registering. This allows frontswap to successfully manage multiple implementations by keeping a list of them all. Signed-off-by: Dan Streetman Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 8293262401de..e65ef959546c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -6,16 +6,16 @@ #include struct frontswap_ops { - void (*init)(unsigned); - int (*store)(unsigned, pgoff_t, struct page *); - int (*load)(unsigned, pgoff_t, struct page *); - void (*invalidate_page)(unsigned, pgoff_t); - void (*invalidate_area)(unsigned); + void (*init)(unsigned); /* this swap type was just swapon'ed */ + int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ + int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ + void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ + void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ + struct frontswap_ops *next; /* private pointer to next ops */ }; extern bool frontswap_enabled; -extern struct frontswap_ops * - frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern void frontswap_writethrough(bool); -- cgit v1.2.3 From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 24 Jun 2015 16:58:51 -0700 Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the following INFO splat is logged: =============================== [ INFO: suspicious RCU usage. ] 4.1.0-rc7-next-20150612 #1 Not tainted ------------------------------- kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by systemd/1: #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1f/0x40 #1: (rcu_read_lock_bh){......}, at: [] ipv6_add_addr+0x62/0x540 #2: (addrconf_hash_lock){+...+.}, at: [] ipv6_add_addr+0x184/0x540 stack backtrace: CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1 Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014 Call Trace: dump_stack+0x4c/0x6e lockdep_rcu_suspicious+0xe7/0x120 ___might_sleep+0x1d5/0x1f0 __might_sleep+0x4d/0x90 kmem_cache_alloc+0x47/0x250 create_object+0x39/0x2e0 kmemleak_alloc_percpu+0x61/0xe0 pcpu_alloc+0x370/0x630 Additional backtrace lines are truncated. In addition, the above splat is followed by several "BUG: sleeping function called from invalid context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau, these are the clue to the fix. Routine kmemleak_alloc_percpu() always uses GFP_KERNEL for its allocations, whereas it should follow the gfp from its callers. Reviewed-by: Catalin Marinas Reviewed-by: Kamalesh Babulal Acked-by: Martin KaFai Lau Signed-off-by: Larry Finger Cc: Martin KaFai Lau Cc: Catalin Marinas Cc: Tejun Heo Cc: Christoph Lameter Cc: [3.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index e705467ddb47..d0a1f99e24e3 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,7 +28,8 @@ extern void kmemleak_init(void) __ref; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; -extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; +extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, gfp_t gfp) { } -static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) { } static inline void kmemleak_free(const void *ptr) -- cgit v1.2.3