diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 188 | ||||
-rw-r--r-- | mm/maccess.c | 11 | ||||
-rw-r--r-- | mm/memory-failure.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 40 | ||||
-rw-r--r-- | mm/nommu.c | 42 | ||||
-rw-r--r-- | mm/page_alloc.c | 72 | ||||
-rw-r--r-- | mm/percpu.c | 4 | ||||
-rw-r--r-- | mm/readahead.c | 12 | ||||
-rw-r--r-- | mm/slab.c | 16 | ||||
-rw-r--r-- | mm/util.c | 44 |
12 files changed, 282 insertions, 161 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 43ea8c3a2bbf..17b8947aa7da 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -221,6 +221,7 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + depends on MMU default 4096 help This is the portion of low virtual memory which should be protected @@ -252,7 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "HWPoison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65f38c218207..e91b81b63670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 13f33b3081ec..5b069e4f5e48 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -93,6 +93,7 @@ #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/workqueue.h> +#include <linux/crc32.h> #include <asm/sections.h> #include <asm/processor.h> @@ -108,7 +109,6 @@ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ -#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -119,8 +119,8 @@ /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; - unsigned long offset; - size_t length; + unsigned long start; + size_t size; }; #define KMEMLEAK_GREY 0 @@ -149,6 +149,8 @@ struct kmemleak_object { int min_count; /* the total number of pointers found pointing to this object */ int count; + /* checksum for detecting modified objects */ + u32 checksum; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; unsigned long trace[MAX_TRACE]; @@ -164,8 +166,6 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) -/* flag set on newly allocated objects */ -#define OBJECT_NEW (1 << 3) /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 @@ -241,8 +241,6 @@ struct early_log { const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ int min_count; /* minimum reference count */ - unsigned long offset; /* scan area offset */ - size_t length; /* scan area length */ unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object) object->count >= object->min_count; } -static bool color_black(const struct kmemleak_object *object) -{ - return object->min_count == KMEMLEAK_BLACK; -} - /* * Objects are considered unreferenced only if their color is white, they have * not be deleted and have a minimum age to avoid false positives caused by @@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object) */ static bool unreferenced_object(struct kmemleak_object *object) { - return (object->flags & OBJECT_ALLOCATED) && color_white(object) && + return (color_white(object) && object->flags & OBJECT_ALLOCATED) && time_before_eq(object->jiffies + jiffies_min_age, jiffies_last_scan); } @@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; + unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + object->comm, object->pid, object->jiffies, + msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); seq_printf(seq, " backtrace:\n"); @@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" checksum = %d\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, INIT_HLIST_HEAD(&object->area_list); spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED | OBJECT_NEW; + object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; object->min_count = min_count; - object->count = -1; /* no color initially */ + object->count = 0; /* white color initially */ object->jiffies = jiffies; + object->checksum = 0; /* task information */ if (in_irq()) { @@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr) * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. */ -static void add_scan_area(unsigned long ptr, unsigned long offset, - size_t length, gfp_t gfp) +static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct kmemleak_scan_area *area; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", ptr); @@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } spin_lock_irqsave(&object->lock, flags); - if (offset + length > object->size) { + if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } INIT_HLIST_NODE(&area->node); - area->offset = offset; - area->length = length; + area->start = ptr; + area->size = size; hlist_add_head(&area->node, &object->area_list); out_unlock: @@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr) * processed later once kmemleak is fully initialized. */ static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) + int min_count) { unsigned long flags; struct early_log *log; @@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, log->ptr = ptr; log->size = size; log->min_count = min_count; - log->offset = offset; - log->length = length; if (op_type == KMEMLEAK_ALLOC) log->trace_len = __save_stack_trace(log->trace); crt_early_log++; @@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); + log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); + log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - add_scan_area((unsigned long)ptr, offset, length, gfp); + add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); + log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); /* + * Update an object's checksum and return true if it was modified. + */ +static bool update_checksum(struct kmemleak_object *object) +{ + u32 old_csum = object->checksum; + + if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) + return false; + + object->checksum = crc32(0, (void *)object->pointer, object->size); + return object->checksum != old_csum; +} + +/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occured. */ @@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end, * added to the gray_list. */ object->count++; - if (color_gray(object)) + if (color_gray(object)) { list_add_tail(&object->gray_list, &gray_list); - else - put_object(object); + spin_unlock_irqrestore(&object->lock, flags); + continue; + } + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); } } @@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object) } } else hlist_for_each_entry(area, elem, &object->area_list, node) - scan_block((void *)(object->pointer + area->offset), - (void *)(object->pointer + area->offset - + area->length), object, 0); + scan_block((void *)area->start, + (void *)(area->start + area->size), + object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } /* + * Scan the objects already referenced (gray objects). More objects will be + * referenced and, if there are no memory leaks, all the objects are scanned. + */ +static void scan_gray_list(void) +{ + struct kmemleak_object *object, *tmp; + + /* + * The list traversal is safe for both tail additions and removals + * from inside the loop. The kmemleak objects cannot be freed from + * outside the loop because their use_count was incremented. + */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + cond_resched(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1090,10 +1133,9 @@ out: static void kmemleak_scan(void) { unsigned long flags; - struct kmemleak_object *object, *tmp; + struct kmemleak_object *object; int i; int new_leaks = 0; - int gray_list_pass = 0; jiffies_last_scan = jiffies; @@ -1114,7 +1156,6 @@ static void kmemleak_scan(void) #endif /* reset the reference count (whiten the object) */ object->count = 0; - object->flags &= ~OBJECT_NEW; if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); @@ -1172,62 +1213,36 @@ static void kmemleak_scan(void) /* * Scan the objects already referenced from the sections scanned - * above. More objects will be referenced and, if there are no memory - * leaks, all the objects will be scanned. The list traversal is safe - * for both tail additions and removals from inside the loop. The - * kmemleak objects cannot be freed from outside the loop because their - * use_count was increased. + * above. */ -repeat: - object = list_entry(gray_list.next, typeof(*object), gray_list); - while (&object->gray_list != &gray_list) { - cond_resched(); - - /* may add new objects to the list */ - if (!scan_should_stop()) - scan_object(object); - - tmp = list_entry(object->gray_list.next, typeof(*object), - gray_list); - - /* remove the object from the list and release it */ - list_del(&object->gray_list); - put_object(object); - - object = tmp; - } - - if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) - goto scan_end; + scan_gray_list(); /* - * Check for new objects allocated during this scanning and add them - * to the gray list. + * Check for new or unreferenced objects modified since the previous + * scan and color them gray until the next scan. */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { spin_lock_irqsave(&object->lock, flags); - if ((object->flags & OBJECT_NEW) && !color_black(object) && - get_object(object)) { - object->flags &= ~OBJECT_NEW; + if (color_white(object) && (object->flags & OBJECT_ALLOCATED) + && update_checksum(object) && get_object(object)) { + /* color it gray temporarily */ + object->count = object->min_count; list_add_tail(&object->gray_list, &gray_list); } spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); - if (!list_empty(&gray_list)) - goto repeat; - -scan_end: - WARN_ON(!list_empty(&gray_list)); + /* + * Re-scan the gray list for modified unreferenced objects. + */ + scan_gray_list(); /* - * If scanning was stopped or new objects were being allocated at a - * higher rate than gray list scanning, do not report any new - * unreferenced objects. + * If scanning was stopped do not report any new unreferenced objects. */ - if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) + if (scan_should_stop()) return; /* @@ -1642,8 +1657,7 @@ void __init kmemleak_init(void) kmemleak_ignore(log->ptr); break; case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->offset, log->length, - GFP_KERNEL); + kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); break; case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); diff --git a/mm/maccess.c b/mm/maccess.c index 9073695ff25f..4e348dbaecd7 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a0466ed5bfd..17299fd4577c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + u32 hwpoison_filter_enable = 0; u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; @@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p) return 0; } +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + EXPORT_SYMBOL_GPL(hwpoison_filter); /* diff --git a/mm/mmap.c b/mm/mmap.c index d9c77b2dbe9d..ee2298936fe6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * Some shared mappigns will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot diff --git a/mm/nommu.c b/mm/nommu.c index 8687973462bb..17773862619b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; @@ -1398,6 +1403,31 @@ error_getting_region: } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. @@ -1891,9 +1921,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74af449b1f1d..4e9f5cc5fb59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <linux/memory.h> #include <trace/events/kmem.h> #include <asm/tlbflush.h> @@ -2401,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; + static DEFINE_MUTEX(zl_order_mutex); + mutex_lock(&zl_order_mutex); if (write) - strncpy(saved_string, (char*)table->data, - NUMA_ZONELIST_ORDER_LEN); + strcpy(saved_string, (char*)table->data); ret = proc_dostring(table, write, buffer, length, ppos); if (ret) - return ret; + goto out; if (write) { int oldval = user_zonelist_order; if (__parse_numa_zonelist_order((char*)table->data)) { @@ -2420,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } else if (oldval != user_zonelist_order) build_all_zonelists(); } - return 0; +out: + mutex_unlock(&zl_order_mutex); + return ret; } @@ -3579,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -static unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -4108,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) } /* sort the node_map by start_pfn */ -static void __init sort_node_map(void) +void __init sort_node_map(void) { sort(early_node_map, (size_t)nr_nodemap_entries, sizeof(struct node_active_region), @@ -5008,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, int set_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; + struct page *curr_page; + unsigned long flags, pfn, iter; + unsigned long immobile = 0; + struct memory_isolate_notify arg; + int notifier_ret; int ret = -EBUSY; int zone_idx; zone = page_zone(page); zone_idx = zone_idx(zone); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || + zone_idx == ZONE_MOVABLE) { + ret = 0; + goto out; + } + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + /* - * In future, more migrate types will be able to be isolation target. + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && - zone_idx != ZONE_MOVABLE) + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret || !arg.pages_found) goto out; - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - ret = 0; + + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { + if (!pfn_valid_within(pfn)) + continue; + + curr_page = pfn_to_page(iter); + if (!page_count(curr_page) || PageLRU(curr_page)) + continue; + + immobile++; + } + + if (arg.pages_found == immobile) + ret = 0; + out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(); diff --git a/mm/percpu.c b/mm/percpu.c index 442010cc91c6..083e7c91e5f6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work) */ void free_percpu(void *ptr) { - void *addr = __pcpu_ptr_to_addr(ptr); + void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; @@ -1279,6 +1279,8 @@ void free_percpu(void *ptr) if (!ptr) return; + addr = __pcpu_ptr_to_addr(ptr); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..033bc135a41f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); + +#ifdef CONFIG_BLOCK + /* + * Normally the current page is !uptodate and lock_page() will be + * immediately called to implicitly unplug the device. However this + * is not always true for RAID conifgurations, where data arrives + * not strictly in their submission order. In this case we need to + * explicitly kick off the IO. + */ + if (PageUptodate(page)) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/slab.c b/mm/slab.c index 3f4822938f46..7451bdacaf18 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -654,7 +654,7 @@ static void init_node_lock_keys(int q) l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) - return; + continue; lockdep_set_class(&l3->list_lock, &on_slab_l3_key); alc = l3->alien; /* @@ -665,7 +665,7 @@ static void init_node_lock_keys(int q) * for alloc_alien_cache, */ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; + continue; for_each_node(r) { if (alc[r]) lockdep_set_class(&alc[r]->lock, @@ -1132,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (nc) free_block(cachep, nc->entry, nc->avail, node); - if (!cpus_empty(*mask)) { + if (!cpumask_empty(mask)) { spin_unlock_irq(&l3->list_lock); goto free_array_cache; } @@ -2275,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Determine if the slab management is 'on' or 'off' slab. * (bootstrapping cannot cope with offslab caches so don't do - * it too early on.) + * it too early on. Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). @@ -2596,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, * kmemleak does not treat the ->s_mem pointer as a reference * to the object. Otherwise we will not report the leak. */ - kmemleak_scan_area(slabp, offsetof(struct slab, list), - sizeof(struct list_head), local_flags); + kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + local_flags); if (!slabp) return NULL; } else { diff --git a/mm/util.c b/mm/util.c index b377ce430803..7c35ad95f927 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,10 +4,6 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/sched.h> -#include <linux/hugetlb.h> -#include <linux/syscalls.h> -#include <linux/mman.h> -#include <linux/file.h> #include <asm/uaccess.h> #define CREATE_TRACE_POINTS @@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) -{ - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; - file = fget(fd); - if (!file) - goto out; - } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; - /* - * VM_NORESERVE is used because the reservations will be - * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary - */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); - if (IS_ERR(file)) - return PTR_ERR(file); - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return retval; -} - /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |