summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/kmemleak.c188
-rw-r--r--mm/maccess.c11
-rw-r--r--mm/memory-failure.c9
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/nommu.c42
-rw-r--r--mm/page_alloc.c72
-rw-r--r--mm/percpu.c4
-rw-r--r--mm/readahead.c12
-rw-r--r--mm/slab.c16
-rw-r--r--mm/util.c44
12 files changed, 282 insertions, 161 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 43ea8c3a2bbf..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -221,6 +221,7 @@ config KSM
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
+ depends on MMU
default 4096
help
This is the portion of low virtual memory which should be protected
@@ -252,7 +253,7 @@ config MEMORY_FAILURE
config HWPOISON_INJECT
tristate "HWPoison pages injector"
- depends on MEMORY_FAILURE && DEBUG_KERNEL
+ depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
select PROC_PAGE_MONITOR
config NOMMU_INITIAL_TRIM_EXCESS
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..e91b81b63670 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, sz);
return;
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 13f33b3081ec..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
+#include <linux/crc32.h>
#include <asm/sections.h>
#include <asm/processor.h>
@@ -108,7 +109,6 @@
#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
#define SECS_FIRST_SCAN 60 /* delay before the first scan */
#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
-#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +119,8 @@
/* scanning area inside a memory block */
struct kmemleak_scan_area {
struct hlist_node node;
- unsigned long offset;
- size_t length;
+ unsigned long start;
+ size_t size;
};
#define KMEMLEAK_GREY 0
@@ -149,6 +149,8 @@ struct kmemleak_object {
int min_count;
/* the total number of pointers found pointing to this object */
int count;
+ /* checksum for detecting modified objects */
+ u32 checksum;
/* memory ranges to be scanned inside an object (empty for all) */
struct hlist_head area_list;
unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
-/* flag set on newly allocated objects */
-#define OBJECT_NEW (1 << 3)
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
@@ -241,8 +241,6 @@ struct early_log {
const void *ptr; /* allocated/freed memory block */
size_t size; /* memory block size */
int min_count; /* minimum reference count */
- unsigned long offset; /* scan area offset */
- size_t length; /* scan area length */
unsigned long trace[MAX_TRACE]; /* stack trace */
unsigned int trace_len; /* stack trace length */
};
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
object->count >= object->min_count;
}
-static bool color_black(const struct kmemleak_object *object)
-{
- return object->min_count == KMEMLEAK_BLACK;
-}
-
/*
* Objects are considered unreferenced only if their color is white, they have
* not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
*/
static bool unreferenced_object(struct kmemleak_object *object)
{
- return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+ return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
time_before_eq(object->jiffies + jiffies_min_age,
jiffies_last_scan);
}
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
struct kmemleak_object *object)
{
int i;
+ unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
- seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ object->comm, object->pid, object->jiffies,
+ msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
seq_printf(seq, " backtrace:\n");
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%lx\n", object->flags);
+ pr_notice(" checksum = %d\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
}
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_HLIST_HEAD(&object->area_list);
spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
- object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
+ object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
object->size = size;
object->min_count = min_count;
- object->count = -1; /* no color initially */
+ object->count = 0; /* white color initially */
object->jiffies = jiffies;
+ object->checksum = 0;
/* task information */
if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
* Add a scanning area to the object. If at least one such area is added,
* kmemleak will only scan these ranges rather than the whole memory block.
*/
-static void add_scan_area(unsigned long ptr, unsigned long offset,
- size_t length, gfp_t gfp)
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
struct kmemleak_scan_area *area;
- object = find_and_get_object(ptr, 0);
+ object = find_and_get_object(ptr, 1);
if (!object) {
kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
}
spin_lock_irqsave(&object->lock, flags);
- if (offset + length > object->size) {
+ if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
}
INIT_HLIST_NODE(&area->node);
- area->offset = offset;
- area->length = length;
+ area->start = ptr;
+ area->size = size;
hlist_add_head(&area->node, &object->area_list);
out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
* processed later once kmemleak is fully initialized.
*/
static void __init log_early(int op_type, const void *ptr, size_t size,
- int min_count, unsigned long offset, size_t length)
+ int min_count)
{
unsigned long flags;
struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
log->ptr = ptr;
log->size = size;
log->min_count = min_count;
- log->offset = offset;
- log->length = length;
if (op_type == KMEMLEAK_ALLOC)
log->trace_len = __save_stack_trace(log->trace);
crt_early_log++;
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+ log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+ log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
+ log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+ log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+ log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
/*
* Limit the range to be scanned in an allocated memory block.
*/
-void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
- size_t length, gfp_t gfp)
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
- add_scan_area((unsigned long)ptr, offset, length, gfp);
+ add_scan_area((unsigned long)ptr, size, gfp);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+ log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
else if (atomic_read(&kmemleak_early_log))
- log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+ log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
/*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+ u32 old_csum = object->checksum;
+
+ if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
+ return false;
+
+ object->checksum = crc32(0, (void *)object->pointer, object->size);
+ return object->checksum != old_csum;
+}
+
+/*
* Memory scanning is a long process and it needs to be interruptable. This
* function checks whether such interrupt condition occured.
*/
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
* added to the gray_list.
*/
object->count++;
- if (color_gray(object))
+ if (color_gray(object)) {
list_add_tail(&object->gray_list, &gray_list);
- else
- put_object(object);
+ spin_unlock_irqrestore(&object->lock, flags);
+ continue;
+ }
+
spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
}
}
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
}
} else
hlist_for_each_entry(area, elem, &object->area_list, node)
- scan_block((void *)(object->pointer + area->offset),
- (void *)(object->pointer + area->offset
- + area->length), object, 0);
+ scan_block((void *)area->start,
+ (void *)(area->start + area->size),
+ object, 0);
out:
spin_unlock_irqrestore(&object->lock, flags);
}
/*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+ struct kmemleak_object *object, *tmp;
+
+ /*
+ * The list traversal is safe for both tail additions and removals
+ * from inside the loop. The kmemleak objects cannot be freed from
+ * outside the loop because their use_count was incremented.
+ */
+ object = list_entry(gray_list.next, typeof(*object), gray_list);
+ while (&object->gray_list != &gray_list) {
+ cond_resched();
+
+ /* may add new objects to the list */
+ if (!scan_should_stop())
+ scan_object(object);
+
+ tmp = list_entry(object->gray_list.next, typeof(*object),
+ gray_list);
+
+ /* remove the object from the list and release it */
+ list_del(&object->gray_list);
+ put_object(object);
+
+ object = tmp;
+ }
+ WARN_ON(!list_empty(&gray_list));
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1090,10 +1133,9 @@ out:
static void kmemleak_scan(void)
{
unsigned long flags;
- struct kmemleak_object *object, *tmp;
+ struct kmemleak_object *object;
int i;
int new_leaks = 0;
- int gray_list_pass = 0;
jiffies_last_scan = jiffies;
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void)
#endif
/* reset the reference count (whiten the object) */
object->count = 0;
- object->flags &= ~OBJECT_NEW;
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void)
/*
* Scan the objects already referenced from the sections scanned
- * above. More objects will be referenced and, if there are no memory
- * leaks, all the objects will be scanned. The list traversal is safe
- * for both tail additions and removals from inside the loop. The
- * kmemleak objects cannot be freed from outside the loop because their
- * use_count was increased.
+ * above.
*/
-repeat:
- object = list_entry(gray_list.next, typeof(*object), gray_list);
- while (&object->gray_list != &gray_list) {
- cond_resched();
-
- /* may add new objects to the list */
- if (!scan_should_stop())
- scan_object(object);
-
- tmp = list_entry(object->gray_list.next, typeof(*object),
- gray_list);
-
- /* remove the object from the list and release it */
- list_del(&object->gray_list);
- put_object(object);
-
- object = tmp;
- }
-
- if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
- goto scan_end;
+ scan_gray_list();
/*
- * Check for new objects allocated during this scanning and add them
- * to the gray list.
+ * Check for new or unreferenced objects modified since the previous
+ * scan and color them gray until the next scan.
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
- if ((object->flags & OBJECT_NEW) && !color_black(object) &&
- get_object(object)) {
- object->flags &= ~OBJECT_NEW;
+ if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+ && update_checksum(object) && get_object(object)) {
+ /* color it gray temporarily */
+ object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
- if (!list_empty(&gray_list))
- goto repeat;
-
-scan_end:
- WARN_ON(!list_empty(&gray_list));
+ /*
+ * Re-scan the gray list for modified unreferenced objects.
+ */
+ scan_gray_list();
/*
- * If scanning was stopped or new objects were being allocated at a
- * higher rate than gray list scanning, do not report any new
- * unreferenced objects.
+ * If scanning was stopped do not report any new unreferenced objects.
*/
- if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
+ if (scan_should_stop())
return;
/*
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void)
kmemleak_ignore(log->ptr);
break;
case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->offset, log->length,
- GFP_KERNEL);
+ kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
break;
case KMEMLEAK_NO_SCAN:
kmemleak_no_scan(log->ptr);
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long probe_kernel_read(void *dst, void *src, size_t size)
+
+long __weak probe_kernel_read(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
+long __probe_kernel_read(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_write")));
+
+long __probe_kernel_write(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6a0466ed5bfd..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
@@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p)
return 0;
}
+#else
+int hwpoison_filter(struct page *p)
+{
+ return 0;
+}
+#endif
+
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(do_mmap_pgoff);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ struct file *file = NULL;
+ unsigned long retval = -EBADF;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ if (unlikely(flags & MAP_HUGETLB))
+ return -EINVAL;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ } else if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ len = ALIGN(len, huge_page_size(&default_hstate));
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ down_write(&current->mm->mmap_sem);
+ retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&current->mm->mmap_sem);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
/*
* Some shared mappigns will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..17773862619b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
+ flush_icache_range(mm->brk, brk);
return mm->brk = brk;
}
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
share:
add_vma_to_mm(current->mm, vma);
- up_write(&nommu_region_sem);
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }
- if (prot & PROT_EXEC)
- flush_icache_range(result, result + len);
+ up_write(&nommu_region_sem);
kleave(" = %lx", result);
return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
}
EXPORT_SYMBOL(do_mmap_pgoff);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ struct file *file = NULL;
+ unsigned long retval = -EBADF;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ file = fget(fd);
+ if (!file)
+ goto out;
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ down_write(&current->mm->mmap_sem);
+ retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&current->mm->mmap_sem);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
/*
* split a vma into two pieces at address 'addr', a new vma is allocated either
* for the first part or the tail.
@@ -1891,9 +1921,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
- len -= copy_to_user((void *) addr, buf, len);
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
- len -= copy_from_user(buf, (void *) addr, len);
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
else
len = 0;
} else {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 74af449b1f1d..4e9f5cc5fb59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <linux/memory.h>
#include <trace/events/kmem.h>
#include <asm/tlbflush.h>
@@ -2401,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
int ret;
+ static DEFINE_MUTEX(zl_order_mutex);
+ mutex_lock(&zl_order_mutex);
if (write)
- strncpy(saved_string, (char*)table->data,
- NUMA_ZONELIST_ORDER_LEN);
+ strcpy(saved_string, (char*)table->data);
ret = proc_dostring(table, write, buffer, length, ppos);
if (ret)
- return ret;
+ goto out;
if (write) {
int oldval = user_zonelist_order;
if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2420,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
} else if (oldval != user_zonelist_order)
build_all_zonelists();
}
- return 0;
+out:
+ mutex_unlock(&zl_order_mutex);
+ return ret;
}
@@ -3579,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-static unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -4108,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
}
/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
+void __init sort_node_map(void)
{
sort(early_node_map, (size_t)nr_nodemap_entries,
sizeof(struct node_active_region),
@@ -5008,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
+ struct page *curr_page;
+ unsigned long flags, pfn, iter;
+ unsigned long immobile = 0;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
int ret = -EBUSY;
int zone_idx;
zone = page_zone(page);
zone_idx = zone_idx(zone);
+
spin_lock_irqsave(&zone->lock, flags);
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+ zone_idx == ZONE_MOVABLE) {
+ ret = 0;
+ goto out;
+ }
+
+ pfn = page_to_pfn(page);
+ arg.start_pfn = pfn;
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
/*
- * In future, more migrate types will be able to be isolation target.
+ * It may be possible to isolate a pageblock even if the
+ * migratetype is not MIGRATE_MOVABLE. The memory isolation
+ * notifier chain is used by balloon drivers to return the
+ * number of pages in a range that are held by the balloon
+ * driver to shrink memory. If all the pages are accounted for
+ * by balloons, are free, or on the LRU, isolation can continue.
+ * Later, for example, when memory hotplug notifier runs, these
+ * pages reported as "can be isolated" should be isolated(freed)
+ * by the balloon driver through the memory notifier chain.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
- zone_idx != ZONE_MOVABLE)
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret || !arg.pages_found)
goto out;
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- ret = 0;
+
+ for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ curr_page = pfn_to_page(iter);
+ if (!page_count(curr_page) || PageLRU(curr_page))
+ continue;
+
+ immobile++;
+ }
+
+ if (arg.pages_found == immobile)
+ ret = 0;
+
out:
+ if (!ret) {
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
drain_all_pages();
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..083e7c91e5f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
*/
void free_percpu(void *ptr)
{
- void *addr = __pcpu_ptr_to_addr(ptr);
+ void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
if (!ptr)
return;
+ addr = __pcpu_ptr_to_addr(ptr);
+
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+
+#ifdef CONFIG_BLOCK
+ /*
+ * Normally the current page is !uptodate and lock_page() will be
+ * immediately called to implicitly unplug the device. However this
+ * is not always true for RAID configurations, where data arrives
+ * not strictly in its submission order. In this case we need to
+ * explicitly kick off the IO.
+ */
+ if (PageUptodate(page))
+ blk_run_backing_dev(mapping->backing_dev_info, NULL);
+#endif
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/slab.c b/mm/slab.c
index 3f4822938f46..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
l3 = s->cs_cachep->nodelists[q];
if (!l3 || OFF_SLAB(s->cs_cachep))
- return;
+ continue;
lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
alc = l3->alien;
/*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
* for alloc_alien_cache,
*/
if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- return;
+ continue;
for_each_node(r) {
if (alc[r])
lockdep_set_class(&alc[r]->lock,
@@ -1132,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
- if (!cpus_empty(*mask)) {
+ if (!cpumask_empty(mask)) {
spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
@@ -2275,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
- * it too early on.)
+ * it too early on. Always use on-slab management when
+ * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+ !(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
@@ -2596,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
* kmemleak does not treat the ->s_mem pointer as a reference
* to the object. Otherwise we will not report the leak.
*/
- kmemleak_scan_area(slabp, offsetof(struct slab, list),
- sizeof(struct list_head), local_flags);
+ kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+ local_flags);
if (!slabp)
return NULL;
} else {
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..7c35ad95f927 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/sched.h>
-#include <linux/hugetlb.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
-SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
- unsigned long, prot, unsigned long, flags,
- unsigned long, fd, unsigned long, pgoff)
-{
- struct file * file = NULL;
- unsigned long retval = -EBADF;
-
- if (!(flags & MAP_ANONYMOUS)) {
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
- file = fget(fd);
- if (!file)
- goto out;
- } else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
- /*
- * VM_NORESERVE is used because the reservations will be
- * taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
- */
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
-
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
-
- if (file)
- fput(file);
-out:
- return retval;
-}
-
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);