Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  10
-rw-r--r--  mm/Makefile          |   7
-rw-r--r--  mm/backing-dev.c     | 520
-rw-r--r--  mm/bootmem.c         |  13
-rw-r--r--  mm/bounce.c          |   2
-rw-r--r--  mm/compaction.c      |   7
-rw-r--r--  mm/dmapool.c         |   2
-rw-r--r--  mm/filemap.c         |  44
-rw-r--r--  mm/fremap.c          |   7
-rw-r--r--  mm/highmem.c         |  61
-rw-r--r--  mm/hugetlb.c         | 312
-rw-r--r--  mm/hwpoison-inject.c |  15
-rw-r--r--  mm/init-mm.c         |   6
-rw-r--r--  mm/internal.h        |   2
-rw-r--r--  mm/kmemleak.c        | 100
-rw-r--r--  mm/ksm.c             |  80
-rw-r--r--  mm/maccess.c         |   2
-rw-r--r--  mm/memblock.c        | 837
-rw-r--r--  mm/memcontrol.c      | 870
-rw-r--r--  mm/memory-failure.c  | 327
-rw-r--r--  mm/memory.c          | 136
-rw-r--r--  mm/memory_hotplug.c  |  66
-rw-r--r--  mm/mempolicy.c       |  99
-rw-r--r--  mm/migrate.c         | 259
-rw-r--r--  mm/mlock.c           |  13
-rw-r--r--  mm/mmap.c            |  77
-rw-r--r--  mm/mmzone.c          |  21
-rw-r--r--  mm/mremap.c          |   4
-rw-r--r--  mm/nommu.c           |  63
-rw-r--r--  mm/oom_kill.c        | 725
-rw-r--r--  mm/page-writeback.c  | 309
-rw-r--r--  mm/page_alloc.c      | 255
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/page_isolation.c  |   3
-rw-r--r--  mm/percpu-km.c       |   8
-rw-r--r--  mm/percpu.c          | 476
-rw-r--r--  mm/percpu_up.c       |  30
-rw-r--r--  mm/rmap.c            | 227
-rw-r--r--  mm/shmem.c           | 156
-rw-r--r--  mm/slab.c            |  11
-rw-r--r--  mm/slob.c            |  18
-rw-r--r--  mm/slub.c            | 871
-rw-r--r--  mm/sparse-vmemmap.c  |  11
-rw-r--r--  mm/swap.c            |   1
-rw-r--r--  mm/swapfile.c        |  78
-rw-r--r--  mm/truncate.c        |  38
-rw-r--r--  mm/util.c            |  24
-rw-r--r--  mm/vmalloc.c         |  80
-rw-r--r--  mm/vmscan.c          | 771
-rw-r--r--  mm/vmstat.c          |  68
50 files changed, 5193 insertions(+), 2931 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index f4e516e9c37c..c2c8a4a11898 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -189,7 +189,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in @@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP + bool + default y diff --git a/mm/Makefile b/mm/Makefile index 34b2546a9e37..f73f75a29f82 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o mmu_context.o \ + page_isolation.o mm_init.o mmu_context.o percpu.o \ $(mmu-y) obj-y += init-mm.o @@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_SMP -obj-y += percpu.o -else -obj-y += percpu_up.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 123bcef13e51..027100d30227 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> +#include <trace/events/writeback.h> static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -29,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); struct backing_dev_info noop_backing_dev_info = { .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); @@ -49,8 +51,6 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); - #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -65,31 +65,25 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb; + struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; struct inode *inode; - /* - * inode lock is enough here, the bdi->wb_list is protected by - * RCU on the reader side - */ nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(wb, &bdi->wb_list, list) { - nr_wb++; - list_for_each_entry(inode, &wb->b_dirty, i_list) - nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) - nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) - nr_more_io++; - } + list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_wb_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + nr_more_io++; spin_unlock(&inode_lock); - get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, 
bdi); + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -98,19 +92,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" "bdi_list: %8u\n" - "state: %8lx\n" - "wb_list: %8u\n", + "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, - !list_empty(&bdi->bdi_list), bdi->state, - !list_empty(&bdi->wb_list)); + K(background_thresh), nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state); #undef K return 0; @@ -247,89 +238,18 @@ static int __init default_bdi_init(void) sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); BUG_ON(IS_ERR(sync_supers_tsk)); - init_timer(&sync_supers_timer); setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); bdi_arm_supers_timer(); err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); + err = bdi_init(&noop_backing_dev_info); return err; } subsys_initcall(default_bdi_init); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) -{ - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); -} - -static void bdi_task_init(struct backing_dev_info *bdi, - struct bdi_writeback *wb) -{ - struct task_struct *tsk = current; - - spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&wb->list, &bdi->wb_list); - spin_unlock(&bdi->wb_lock); - - tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(tsk, 0); -} - -static int bdi_start_fn(void *ptr) -{ - struct bdi_writeback *wb = ptr; - struct backing_dev_info *bdi = wb->bdi; - int ret; - - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - bdi_task_init(bdi, wb); - - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - - ret = bdi_writeback_task(wb); - - /* - * Remove us from the list - */ - spin_lock(&bdi->wb_lock); - list_del_rcu(&wb->list); - spin_unlock(&bdi->wb_lock); - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore. - */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); - - wb->task = NULL; - return ret; -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -348,10 +268,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi) } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback tasks individually. + * bdi writeback thread individually. 
*/ static int bdi_sync_supers(void *unused) { @@ -387,144 +307,201 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } -static int bdi_forker_task(void *ptr) +static void wakeup_timer_fn(unsigned long data) +{ + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + trace_writeback_wake_thread(bdi); + wake_up_process(bdi->wb.task); + } else { + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + trace_writeback_wake_forker_thread(bdi); + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} + +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; + + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} + +static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - bdi_task_init(me->bdi, me); + current->flags |= PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); for (;;) { - struct backing_dev_info *bdi, *tmp; - struct bdi_writeback *wb; + struct task_struct *task = NULL; + struct backing_dev_info *bdi; + enum { + NO_ACTION, /* Nothing to do */ + FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ + } action = NO_ACTION; /* * Temporary measure, we want to make sure we don't see * dirty data on the default backing_dev_info */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); wb_do_writeback(me, 0); + } spin_lock_bh(&bdi_lock); + set_current_state(TASK_INTERRUPTIBLE); - /* - * Check if any existing bdi's have dirty data without - * a thread registered. If so, set that up. 
- */ - list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { - if (bdi->wb.task) - continue; - if (list_empty(&bdi->work_list) && - !bdi_has_dirty_io(bdi)) + list_for_each_entry(bdi, &bdi_list, bdi_list) { + bool have_dirty_io; + + if (!bdi_cap_writeback_dirty(bdi) || + bdi_cap_flush_forker(bdi)) continue; - bdi_add_default_flusher_task(bdi); - } + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi %p/%s is not registered!\n", bdi, bdi->name); - set_current_state(TASK_INTERRUPTIBLE); + have_dirty_io = !list_empty(&bdi->work_list) || + wb_has_dirty_io(&bdi->wb); - if (list_empty(&bdi_pending_list)) { - unsigned long wait; + /* + * If the bdi has work to do, but the thread does not + * exist - create it. + */ + if (!bdi->wb.task && have_dirty_io) { + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); + action = FORK_THREAD; + break; + } + + spin_lock(&bdi->wb_lock); - spin_unlock_bh(&bdi_lock); - wait = msecs_to_jiffies(dirty_writeback_interval * 10); - if (wait) - schedule_timeout(wait); + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. + */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); + } + spin_unlock_bh(&bdi_lock); + + /* Keep working if default bdi still has things to do */ + if (!list_empty(&me->bdi->work_list)) + __set_current_state(TASK_RUNNING); + + switch (action) { + case FORK_THREAD: + __set_current_state(TASK_RUNNING); + task = kthread_create(bdi_writeback_thread, &bdi->wb, + "flush-%s", dev_name(bdi->dev)); + if (IS_ERR(task)) { + /* + * If thread creation fails, force writeout of + * the bdi from the thread. + */ + bdi_flush_io(bdi); + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + * And as soon as the bdi thread is visible, we + * can start it. + */ + spin_lock_bh(&bdi->wb_lock); + bdi->wb.task = task; + spin_unlock_bh(&bdi->wb_lock); + wake_up_process(task); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); + break; + + case NO_ACTION: + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); else - schedule(); + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); + /* Back to the main loop */ continue; } - __set_current_state(TASK_RUNNING); - /* - * This is our real job - check for pending entries in - * bdi_pending_list, and create the tasks that got added + * Clear pending bit and wakeup anybody waiting to tear us down. 
*/ - bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, - bdi_list); - list_del_init(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); - - wb = &bdi->wb; - wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", - dev_name(bdi->dev)); - /* - * If task creation fails, then readd the bdi to - * the pending list and force writeout of the bdi - * from this forker thread. That will free some memory - * and we can try again. - */ - if (IS_ERR(wb->task)) { - wb->task = NULL; - - /* - * Add this 'bdi' to the back, so we get - * a chance to flush other bdi's to free - * memory. - */ - spin_lock_bh(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock_bh(&bdi_lock); - - bdi_flush_io(bdi); - } + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); } return 0; } -static void bdi_add_to_pending(struct rcu_head *head) -{ - struct backing_dev_info *bdi; - - bdi = container_of(head, struct backing_dev_info, rcu_head); - INIT_LIST_HEAD(&bdi->bdi_list); - - spin_lock(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock(&bdi_lock); - - /* - * We are now on the pending list, wake up bdi_forker_task() - * to finish the job and add us back to the active bdi_list - */ - wake_up_process(default_backing_dev_info.wb.task); -} - -/* - * Add the default flusher task that gets created for any bdi - * that has dirty data pending writeout - */ -void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) -{ - if (!bdi_cap_writeback_dirty(bdi)) - return; - - if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { - printk(KERN_ERR "bdi %p/%s is not registered!\n", - bdi, bdi->name); - return; - } - - /* - * Check with the helper whether to proceed adding a task. Will only - * abort if we two or more simultanous calls to - * bdi_add_default_flusher_task() occured, further additions will block - * waiting for previous additions to finish. - */ - if (!test_and_set_bit(BDI_pending, &bdi->state)) { - list_del_rcu(&bdi->bdi_list); - - /* - * We must wait for the current RCU period to end before - * moving to the pending list. So schedule that operation - * from an RCU callback. - */ - call_rcu(&bdi->rcu_head, bdi_add_to_pending); - } -} - /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -541,23 +518,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) 
{ va_list args; - int ret = 0; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ - goto exit; + return 0; va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); va_end(args); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto exit; - } - - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + if (IS_ERR(dev)) + return PTR_ERR(dev); bdi->dev = dev; @@ -569,21 +539,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (bdi_cap_flush_forker(bdi)) { struct bdi_writeback *wb = &bdi->wb; - wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", dev_name(dev)); - if (IS_ERR(wb->task)) { - wb->task = NULL; - ret = -ENOMEM; - - bdi_remove_from_list(bdi); - goto exit; - } + if (IS_ERR(wb->task)) + return PTR_ERR(wb->task); } bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); -exit: - return ret; + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } EXPORT_SYMBOL(bdi_register); @@ -598,31 +568,29 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct bdi_writeback *wb; - if (!bdi_cap_writeback_dirty(bdi)) return; /* - * If setup is pending, wait for that to complete first + * Make sure nobody finds us on the bdi_list anymore */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + bdi_remove_from_list(bdi); /* - * Make sure nobody finds us on the bdi_list anymore + * If setup is pending, wait for that to complete first */ - bdi_remove_from_list(bdi); + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); /* - * Finally, kill the kernel threads. We don't need to be RCU + * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. Force * unfreeze of the thread before calling kthread_stop(), otherwise * it would never exet if it is currently stuck in the refrigerator. 
*/ - list_for_each_entry(wb, &bdi->wb_list, list) { - thaw_process(wb->task); - kthread_stop(wb->task); + if (bdi->wb.task) { + thaw_process(bdi->wb.task); + kthread_stop(bdi->wb.task); } } @@ -644,7 +612,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); @@ -655,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -665,9 +647,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); - INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); bdi_wb_init(&bdi->wb, bdi); @@ -749,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -756,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -768,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -784,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. 
return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/bootmem.c b/mm/bootmem.c index 142c84a54993..13b0caa9793c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/kmemleak.h> #include <linux/range.h> +#include <linux/memblock.h> #include <asm/bug.h> #include <asm/io.h> @@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(physaddr, physaddr + size); + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); #else unsigned long start, end; @@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(addr, addr + size); + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); #else unsigned long start, end; @@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #ifndef CONFIG_NO_BOOTMEM +int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} + static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { diff --git a/mm/bounce.c b/mm/bounce.c index 13b6dad1eed2..1481de68184b 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) */ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - flush_dcache_page(tovec->bv_page); bounce_copy_vec(tovec, vfrom); + flush_dcache_page(tovec->bv_page); } } diff --git a/mm/compaction.c b/mm/compaction.c index 94cce51b0b35..4d709ee59013 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { - - unsigned long inactive, isolated; + unsigned long active, inactive, isolated; inactive = zone_page_state(zone, NR_INACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_ANON); + active = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_ANON); isolated = zone_page_state(zone, NR_ISOLATED_FILE) + zone_page_state(zone, NR_ISOLATED_ANON); - return isolated > inactive; + return isolated > (inactive + active) / 2; } /* diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df063706f53..4df2de77e069 
100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/filemap.c b/mm/filemap.c index 20e5642e9f9f..75572b5f2374 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return ret | VM_FAULT_RETRY; + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; @@ -2238,14 +2254,12 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); diff --git a/mm/fremap.c b/mm/fremap.c index 46f5dacf90a2..ec520c7b28df 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, { struct mm_struct *mm = current->mm; struct address_space *mapping; - unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; @@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= start) return err; + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return err; + /* Can we represent this offset inside this architecture's pte's? 
*/ #if PTE_FILE_MAX_BITS < BITS_PER_LONG if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) @@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; - if (end <= start || start < vma->vm_start || end > vma->vm_end) + if (start < vma->vm_start || start + size > vma->vm_end) goto out; /* Must set VM_NONLINEAR before any pages are populated. */ diff --git a/mm/highmem.c b/mm/highmem.c index 66baa20f78f5..693394daa2ed 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,8 +26,14 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/kgdb.h> #include <asm/tlbflush.h> + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +DEFINE_PER_CPU(int, __kmap_atomic_idx); +#endif + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -41,6 +47,9 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -421,55 +430,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -} - -#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 54d42b009dbe..c4a3558589ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,9 @@ #include <linux/bootmem.h> #include <linux/sysfs.h> #include <linux/slab.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. 
@@ -414,14 +423,14 @@ static void clear_huge_page(struct page *page, } } -static void copy_gigantic_page(struct page *dst, struct page *src, +static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); @@ -431,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, + +static void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_user_gigantic_page(dst, src, addr, vma); return; } @@ -449,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, } } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -457,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; @@ -487,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } err: @@ -552,6 +603,7 @@ static void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); 
INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -605,6 +657,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -758,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -800,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -811,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -836,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -859,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); - if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -896,31 +964,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. 
+ */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -1040,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); @@ -2129,6 +2196,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2140,6 +2208,32 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2198,6 +2292,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -2207,6 +2307,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2272,6 +2373,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. 
+ */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2286,8 +2390,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2338,7 +2444,17 @@ retry_avoidcopy: return -PTR_ERR(new_page); } - copy_huge_page(new_page, old_page, address, vma); + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); + return VM_FAULT_OOM; + } + + copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); /* @@ -2349,11 +2465,19 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + mmu_notifier_invalidate_range_start(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; + mmu_notifier_invalidate_range_end(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); } page_cache_release(new_page); page_cache_release(old_page); @@ -2452,10 +2576,27 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); + page_dup_rmap(page); } else { lock_page(page); - page->mapping = HUGETLB_POISON; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); } + } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); + goto backout_unlocked; + } + page_dup_rmap(page); } /* @@ -2507,10 +2648,22 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -2548,6 +2701,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. 
+ * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. + */ + page = pte_page(entry); + if (page != pagecache_page) + lock_page(page); + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2574,6 +2738,7 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -2785,3 +2950,42 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +#ifdef CONFIG_MEMORY_FAILURE + +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +int dequeue_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + int ret = -EBUSY; + + spin_lock(&hugetlb_lock); + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} +#endif diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 10ea71905c1f..0948f1072d6b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/hugetlb.h> #include "internal.h" static struct dentry *hwpoison_dir; @@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; + struct page *hpage; int err; if (!capable(CAP_SYS_ADMIN)) @@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); + hpage = compound_head(p); /* * This implies unable to support free buddy pages. */ - if (!get_page_unless_zero(p)) + if (!get_page_unless_zero(hpage)) return 0; - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); /* * This implies unable to support non-LRU pages. */ - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) return 0; /* @@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val) * We temporarily take page lock for try_get_mem_cgroup_from_page(). * __memory_failure() will redo the check reliably inside page lock. 
*/ - lock_page(p); - err = hwpoison_filter(p); - unlock_page(p); + lock_page(hpage); + err = hwpoison_filter(hpage); + unlock_page(hpage); if (err) return 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index 57aba0da9668..1d29cdfe8ebb 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -7,6 +7,11 @@ #include <asm/atomic.h> #include <asm/pgtable.h> +#include <asm/mmu.h> + +#ifndef INIT_MM_CONTEXT +#define INIT_MM_CONTEXT(name) +#endif struct mm_struct init_mm = { .mm_rb = RB_ROOT, @@ -17,4 +22,5 @@ struct mm_struct init_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .cpu_vm_mask = CPU_MASK_ALL, + INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..dedb0aff673f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2c0d032ac898..bd9bc214091b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -211,6 +211,9 @@ static signed long jiffies_scan_wait; static int kmemleak_stack_scan = 1; /* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); +/* setting kmemleak=on, will set this var, skipping the disable */ +static int kmemleak_skip_disable; + /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - kmemleak_warn("Found object by alias"); + pr_warning("Found object by alias at 0x%08lx\n", ptr); + dump_stack(); + dump_object_info(object); object = NULL; } } else @@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color) } /* - * Make a object permanently as gray-colored so that it can no longer be + * Mark an object permanently as gray-colored so that it can no longer be * reported as a leak. This is used in general to mark a false positive. */ static void make_gray_object(unsigned long ptr) @@ -838,10 +843,19 @@ out: rcu_read_unlock(); } -/* - * Memory allocation function callback. This function is called from the - * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, - * vmalloc etc.). +/** + * kmemleak_alloc - register a newly allocated object + * @ptr: pointer to beginning of the object + * @size: size of the object + * @min_count: minimum number of references to this object. If during memory + * scanning a number of references less than @min_count is found, + * the object is reported as a memory leak. If @min_count is 0, + * the object is never reported as a leak. If @min_count is -1, + * the object is ignored (not scanned and not reported as a leak) + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the kernel allocators when a new object + * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, } EXPORT_SYMBOL_GPL(kmemleak_alloc); -/* - * Memory freeing function callback. 
This function is called from the kernel - * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). +/** + * kmemleak_free - unregister a previously registered object + * @ptr: pointer to beginning of the object + * + * This function is called from the kernel allocators when an object (memory + * block) is freed (kmem_cache_free, kfree, vfree etc.). */ void __ref kmemleak_free(const void *ptr) { @@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr) } EXPORT_SYMBOL_GPL(kmemleak_free); -/* - * Partial memory freeing function callback. This function is usually called - * from bootmem allocator when (part of) a memory block is freed. +/** + * kmemleak_free_part - partially unregister a previously registered object + * @ptr: pointer to the beginning or inside the object. This also + * represents the start of the range to be freed + * @size: size to be unregistered + * + * This function is called when only a part of a memory block is freed + * (usually from the bootmem allocator). */ void __ref kmemleak_free_part(const void *ptr, size_t size) { @@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) } EXPORT_SYMBOL_GPL(kmemleak_free_part); -/* - * Mark an already allocated memory block as a false positive. This will cause - * the block to no longer be reported as leak and always be scanned. +/** + * kmemleak_not_leak - mark an allocated object as false positive + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to no longer + * be reported as leak and always be scanned. */ void __ref kmemleak_not_leak(const void *ptr) { @@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_not_leak); -/* - * Ignore a memory block. This is usually done when it is known that the - * corresponding block is not a leak and does not contain any references to - * other allocated memory blocks. +/** + * kmemleak_ignore - ignore an allocated object + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to be + * ignored (not scanned and not reported as a leak). This is usually done when + * it is known that the corresponding block is not a leak and does not contain + * any references to other allocated memory blocks. */ void __ref kmemleak_ignore(const void *ptr) { @@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr) } EXPORT_SYMBOL(kmemleak_ignore); -/* - * Limit the range to be scanned in an allocated memory block. +/** + * kmemleak_scan_area - limit the range to be scanned in an allocated object + * @ptr: pointer to beginning or inside the object. This also + * represents the start of the scan area + * @size: size of the scan area + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is used when it is known that only certain parts of an object + * contain references to other objects. Kmemleak will only scan these areas + * reducing the number false negatives. */ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { @@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) } EXPORT_SYMBOL(kmemleak_scan_area); -/* - * Inform kmemleak not to scan the given memory block. +/** + * kmemleak_no_scan - do not scan an allocated object + * @ptr: pointer to beginning of the object + * + * This function notifies kmemleak not to scan the given memory block. 
Useful + * in situations where it is known that the given object does not contain any + * references to other objects. Kmemleak will not scan such objects reducing + * the number of false negatives. */ void __ref kmemleak_no_scan(const void *ptr) { @@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") != 0) + else if (strcmp(str, "on") == 0) + kmemleak_skip_disable = 1; + else return -EINVAL; return 0; } @@ -1616,6 +1661,13 @@ void __init kmemleak_init(void) int i; unsigned long flags; +#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF + if (!kmemleak_skip_disable) { + kmemleak_disable(); + return; + } +#endif + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -33,6 +33,7 @@ #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> +#include <linux/hash.h> #include <asm/tlbflush.h> #include "internal.h" @@ -153,8 +154,9 @@ struct rmap_item { static struct rb_root root_stable_tree = RB_ROOT; static struct rb_root root_unstable_tree = RB_ROOT; -#define MM_SLOTS_HASH_HEADS 1024 -static struct hlist_head *mm_slots_hash; +#define MM_SLOTS_HASH_SHIFT 10 +#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) +static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; static struct mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), @@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) kmem_cache_free(mm_slot_cache, mm_slot); } -static int __init mm_slots_hash_init(void) -{ - mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!mm_slots_hash) - return -ENOMEM; - return 0; -} - -static void __init mm_slots_hash_free(void) -{ - kfree(mm_slots_hash); -} - static struct mm_slot *get_mm_slot(struct mm_struct *mm) { struct mm_slot *mm_slot; struct hlist_head *bucket; struct hlist_node *node; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; hlist_for_each_entry(mm_slot, node, bucket, link) { if (mm == mm_slot->mm) return mm_slot; @@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, { struct hlist_head *bucket; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; mm_slot->mm = mm; hlist_add_head(&mm_slot->link, bucket); } @@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } -static void drop_anon_vma(struct rmap_item *rmap_item) +static void ksm_drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } + drop_anon_vma(anon_vma); } /* @@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. 
*/ - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -731,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (!ptep) goto out; - if (pte_write(*ptep)) { + if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; swapped = PageSwapCache(page); @@ -754,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at(mm, addr, ptep, entry); goto out_unlock; } - entry = pte_wrprotect(entry); + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, addr, ptep, entry); } *orig_pte = *ptep; @@ -1523,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page, { struct page *new_page; - unlock_page(page); /* any racers will COW it, not modify it */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { copy_user_highpage(new_page, page, address, vma); @@ -1540,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page, add_page_to_unevictable_list(new_page); } - page_cache_release(page); return new_page; } @@ -1566,7 +1546,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1589,7 +1569,7 @@ again: if (!search_new_forks || !mapcount) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (!mapcount) goto out; } @@ -1619,7 +1599,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1637,11 +1617,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1671,7 +1651,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1688,11 +1668,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1943,15 +1923,11 @@ static int __init ksm_init(void) if (err) goto out; - err = mm_slots_hash_init(); - if (err) - goto out_free1; - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { printk(KERN_ERR "ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); - goto out_free2; + goto out_free; } #ifdef CONFIG_SYSFS 
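/*
 * Illustrative sketch, not part of the patch above: the ksm.c hunks replace
 * the kmalloc'ed mm_slots_hash with a statically sized array indexed by
 * hash_ptr(mm, MM_SLOTS_HASH_SHIFT), which is why mm_slots_hash_init() and
 * mm_slots_hash_free() disappear from ksm_init() further down. A stand-alone
 * user-space sketch of that pattern follows; struct slot, ptr_hash(),
 * get_slot() and insert_slot() are invented names, and ptr_hash() is only a
 * stand-in for the kernel's hash_ptr().
 */
#include <stdint.h>
#include <stdio.h>

#define SLOTS_HASH_SHIFT 10
#define SLOTS_HASH_HEADS (1 << SLOTS_HASH_SHIFT)

struct slot {
	void *key;		/* the pointer being hashed (the kernel hashes the mm) */
	struct slot *next;	/* simple per-bucket chain */
};

/* Static table: no runtime allocation, so no init/free error paths. */
static struct slot *slots_hash[SLOTS_HASH_HEADS];

/* Stand-in for hash_ptr(): multiply by a large odd constant, keep top bits. */
static unsigned int ptr_hash(const void *p, unsigned int bits)
{
	uint64_t v = (uint64_t)(uintptr_t)p * 0x9e3779b97f4a7c15ULL;

	return (unsigned int)(v >> (64 - bits));
}

static struct slot *get_slot(void *key)
{
	struct slot *s;

	for (s = slots_hash[ptr_hash(key, SLOTS_HASH_SHIFT)]; s; s = s->next)
		if (s->key == key)
			return s;
	return NULL;
}

static void insert_slot(struct slot *s, void *key)
{
	struct slot **bucket = &slots_hash[ptr_hash(key, SLOTS_HASH_SHIFT)];

	s->key = key;
	s->next = *bucket;
	*bucket = s;
}

int main(void)
{
	static struct slot s;
	int dummy;

	insert_slot(&s, &dummy);
	printf("hit: %d\n", get_slot(&dummy) == &s);	/* prints "hit: 1" */
	return 0;
}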
@@ -1959,7 +1935,7 @@ static int __init ksm_init(void) if (err) { printk(KERN_ERR "ksm: register sysfs failed\n"); kthread_stop(ksm_thread); - goto out_free2; + goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ @@ -1975,9 +1951,7 @@ static int __init ksm_init(void) #endif return 0; -out_free2: - mm_slots_hash_free(); -out_free1: +out_free: ksm_slab_free(); out: return err; diff --git a/mm/maccess.c b/mm/maccess.c index 4e348dbaecd7..e2b6f5634e0d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -1,9 +1,9 @@ /* * Access kernel memory without faulting. */ -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/mm.h> +#include <linux/uaccess.h> /** * probe_kernel_read(): safely attempt to read from a location diff --git a/mm/memblock.c b/mm/memblock.c index 3024eb30fc27..400dc62697d7 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -11,237 +11,423 @@ */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/bitops.h> +#include <linux/poison.h> +#include <linux/pfn.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/memblock.h> -#define MEMBLOCK_ALLOC_ANYWHERE 0 +struct memblock memblock __initdata_memblock; -struct memblock memblock; +int memblock_debug __initdata_memblock; +int memblock_can_resize __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; -static int memblock_debug; +/* inline so we don't get a warning when pr_debug is compiled out */ +static inline const char *memblock_type_name(struct memblock_type *type) +{ + if (type == &memblock.memory) + return "memory"; + else if (type == &memblock.reserved) + return "reserved"; + else + return "unknown"; +} -static int __init early_memblock(char *p) +/* + * Address comparison utilities + */ + +static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) { - if (p && strstr(p, "debug")) - memblock_debug = 1; + return addr & ~(size - 1); +} + +static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + +static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + if (base2 == base1 + size1) + return 1; + else if (base1 == base2 + size2) + return -1; + return 0; } -early_param("memblock", early_memblock); -static void memblock_dump(struct memblock_region *region, char *name) +static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, + unsigned long r1, unsigned long r2) { - unsigned long long base, size; - int i; + phys_addr_t base1 = type->regions[r1].base; + phys_addr_t size1 = type->regions[r1].size; + phys_addr_t base2 = type->regions[r2].base; + phys_addr_t size2 = type->regions[r2].size; - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + return memblock_addrs_adjacent(base1, size1, base2, size2); +} - for (i = 0; i < region->cnt; i++) { - base = region->region[i].base; - size = region->region[i].size; +long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + unsigned long i; - 
pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", - name, i, base, base + size - 1, size); + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; } + + return (i < type->cnt) ? i : -1; } -void memblock_dump_all(void) +/* + * Find, allocate, deallocate or reserve unreserved regions. All allocations + * are top-down. + */ + +static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align) { - if (!memblock_debug) - return; + phys_addr_t base, res_base; + long j; - pr_info("MEMBLOCK configuration:\n"); - pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); - pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); + /* In case, huge size is requested */ + if (end < size) + return MEMBLOCK_ERROR; - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + base = memblock_align_down((end - size), align); + + /* Prevent allocations returning 0 as it's also used to + * indicate an allocation failure + */ + if (start == 0) + start = PAGE_SIZE; + + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) + return base; + res_base = memblock.reserved.regions[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + + return MEMBLOCK_ERROR; } -static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, - u64 size2) +static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, + phys_addr_t align, phys_addr_t start, phys_addr_t end) { - return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); + long i; + + BUG_ON(0 == size); + + size = memblock_align_up(size, align); + + /* Pump up max_addr */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* We do a top-down search, this tends to limit memory + * fragmentation by keeping early boot allocs near the + * top of memory + */ + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + phys_addr_t memblockbase = memblock.memory.regions[i].base; + phys_addr_t memblocksize = memblock.memory.regions[i].size; + phys_addr_t bottom, top, found; + + if (memblocksize < size) + continue; + if ((memblockbase + memblocksize) <= start) + break; + bottom = max(memblockbase, start); + top = min(memblockbase + memblocksize, end); + if (bottom >= top) + continue; + found = memblock_find_region(bottom, top, size, align); + if (found != MEMBLOCK_ERROR) + return found; + } + return MEMBLOCK_ERROR; } -static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) +/* + * Find a free area with specified alignment in a specific range. 
+ */ +u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) { - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; + return memblock_find_base(size, align, start, end); +} - return 0; +/* + * Free memblock.reserved.regions + */ +int __init_memblock memblock_free_reserved_regions(void) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + return memblock_free(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static long memblock_regions_adjacent(struct memblock_region *rgn, - unsigned long r1, unsigned long r2) +/* + * Reserve memblock.reserved.regions + */ +int __init_memblock memblock_reserve_reserved_regions(void) { - u64 base1 = rgn->region[r1].base; - u64 size1 = rgn->region[r1].size; - u64 base2 = rgn->region[r2].base; - u64 size2 = rgn->region[r2].size; + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; - return memblock_addrs_adjacent(base1, size1, base2, size2); + return memblock_reserve(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { unsigned long i; - for (i = r; i < rgn->cnt - 1; i++) { - rgn->region[i].base = rgn->region[i + 1].base; - rgn->region[i].size = rgn->region[i + 1].size; + for (i = r; i < type->cnt - 1; i++) { + type->regions[i].base = type->regions[i + 1].base; + type->regions[i].size = type->regions[i + 1].size; } - rgn->cnt--; + type->cnt--; } /* Assumption: base addr of region 1 < base addr of region 2 */ -static void memblock_coalesce_regions(struct memblock_region *rgn, +static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, unsigned long r1, unsigned long r2) { - rgn->region[r1].size += rgn->region[r2].size; - memblock_remove_region(rgn, r2); + type->regions[r1].size += type->regions[r2].size; + memblock_remove_region(type, r2); } -void __init memblock_init(void) +/* Defined below but needed now */ +static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); + +static int __init_memblock memblock_double_array(struct memblock_type *type) { - /* Create a dummy zero size MEMBLOCK which will get coalesced away later. - * This simplifies the memblock_add() code below... + struct memblock_region *new_array, *old_array; + phys_addr_t old_size, new_size, addr; + int use_slab = slab_is_available(); + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation */ - memblock.memory.region[0].base = 0; - memblock.memory.region[0].size = 0; - memblock.memory.cnt = 1; + if (!memblock_can_resize) + return -1; - /* Ditto. */ - memblock.reserved.region[0].base = 0; - memblock.reserved.region[0].size = 0; - memblock.reserved.cnt = 1; -} + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + + /* Try to find some space for it. + * + * WARNING: We assume that either slab_is_available() and we use it or + * we use MEMBLOCK for allocations. 
That means that this is unsafe to use + * when bootmem is currently active (unless bootmem itself is implemented + * on top of MEMBLOCK which isn't the case yet) + * + * This should however not be an issue for now, as we currently only + * call into MEMBLOCK while it's still active, or much later when slab is + * active for memory hotplug operations + */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + } else + addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr == MEMBLOCK_ERROR) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + memblock_type_name(type), type->max, type->max * 2); + return -1; + } + new_array = __va(addr); -void __init memblock_analyze(void) -{ - int i; + memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); + + /* Found space, we now need to move the array over before + * we add the reserved region since it may be our reserved + * array itself that is full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* If we use SLAB that's it, we are done */ + if (use_slab) + return 0; - memblock.memory.size = 0; + /* Add the new reserved region now. Should not fail ! */ + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); - for (i = 0; i < memblock.memory.cnt; i++) - memblock.memory.size += memblock.memory.region[i].size; + /* If the array wasn't our static init one, then free it. We only do + * that before SLAB is available as later on, we don't know whether + * to use kfree or free_bootmem_pages(). Shouldn't be a big deal + * anyways + */ + if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_size); + + return 0; +} + +extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2) +{ + return 1; } -static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) +static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long coalesced = 0; long adjacent, i; - if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; return 0; } /* First try and coalesce this MEMBLOCK with another. 
*/ - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; if ((rgnbase == base) && (rgnsize == size)) /* Already have this region, so we're done */ return 0; adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); + /* Check if arch allows coalescing */ + if (adjacent != 0 && type == &memblock.memory && + !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) + break; if (adjacent > 0) { - rgn->region[i].base -= size; - rgn->region[i].size += size; + type->regions[i].base -= size; + type->regions[i].size += size; coalesced++; break; } else if (adjacent < 0) { - rgn->region[i].size += size; + type->regions[i].size += size; coalesced++; break; } } - if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { - memblock_coalesce_regions(rgn, i, i+1); + /* If we plugged a hole, we may want to also coalesce with the + * next region + */ + if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && + ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, + type->regions[i].size, + type->regions[i+1].base, + type->regions[i+1].size)))) { + memblock_coalesce_regions(type, i, i+1); coalesced++; } if (coalesced) return coalesced; - if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) + + /* If we are out of space, we fail. It's too late to resize the array + * but then this shouldn't have happened in the first place. + */ + if (WARN_ON(type->cnt >= type->max)) return -1; /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ - for (i = rgn->cnt - 1; i >= 0; i--) { - if (base < rgn->region[i].base) { - rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].size = rgn->region[i].size; + for (i = type->cnt - 1; i >= 0; i--) { + if (base < type->regions[i].base) { + type->regions[i+1].base = type->regions[i].base; + type->regions[i+1].size = type->regions[i].size; } else { - rgn->region[i+1].base = base; - rgn->region[i+1].size = size; + type->regions[i+1].base = base; + type->regions[i+1].size = size; break; } } - if (base < rgn->region[0].base) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if (base < type->regions[0].base) { + type->regions[0].base = base; + type->regions[0].size = size; + } + type->cnt++; + + /* The array is full ? Try to resize it. If that fails, we undo + * our allocation and return an error + */ + if (type->cnt == type->max && memblock_double_array(type)) { + type->cnt--; + return -1; } - rgn->cnt++; return 0; } -long memblock_add(u64 base, u64 size) +long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.memory; - - /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. 
*/ - if (base == 0) - memblock.rmo_size = size; - - return memblock_add_region(_rgn, base, size); + return memblock_add_region(&memblock.memory, base, size); } -static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) +static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { - u64 rgnbegin, rgnend; - u64 end = base + size; + phys_addr_t rgnbegin, rgnend; + phys_addr_t end = base + size; int i; rgnbegin = rgnend = 0; /* supress gcc warnings */ /* Find the region where (base, size) belongs to */ - for (i=0; i < rgn->cnt; i++) { - rgnbegin = rgn->region[i].base; - rgnend = rgnbegin + rgn->region[i].size; + for (i=0; i < type->cnt; i++) { + rgnbegin = type->regions[i].base; + rgnend = rgnbegin + type->regions[i].size; if ((rgnbegin <= base) && (end <= rgnend)) break; } /* Didn't find the region */ - if (i == rgn->cnt) + if (i == type->cnt) return -1; /* Check to see if we are removing entire region */ if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(rgn, i); + memblock_remove_region(type, i); return 0; } /* Check to see if region is matching at the front */ if (rgnbegin == base) { - rgn->region[i].base = end; - rgn->region[i].size -= size; + type->regions[i].base = end; + type->regions[i].size -= size; return 0; } /* Check to see if the region is matching at the end */ if (rgnend == end) { - rgn->region[i].size -= size; + type->regions[i].size -= size; return 0; } @@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) * We need to split the entry - adjust the current one to the * beginging of the hole and add the region after hole. */ - rgn->region[i].size = base - rgn->region[i].base; - return memblock_add_region(rgn, end, rgnend - end); + type->regions[i].size = base - type->regions[i].base; + return memblock_add_region(type, end, rgnend - end); } -long memblock_remove(u64 base, u64 size) +long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -long __init memblock_free(u64 base, u64 size) +long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.reserved, base, size); } -long __init memblock_reserve(u64 base, u64 size) +long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.reserved; + struct memblock_type *_rgn = &memblock.reserved; BUG_ON(0 == size); return memblock_add_region(_rgn, base, size); } -long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - unsigned long i; + phys_addr_t found; - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) - break; - } + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); + + found = memblock_find_base(size, align, 0, max_addr); + if (found != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, found, size) >= 0) + return found; - return (i < rgn->cnt) ? 
i : -1; + return 0; } -static u64 memblock_align_down(u64 addr, u64 size) +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return addr & ~(size - 1); + phys_addr_t alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; } -static u64 memblock_align_up(u64 addr, u64 size) +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) { - return (addr + (size - 1)) & ~(size - 1); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, - u64 size, u64 align) + +/* + * Additional node-local allocators. Search for node memory is bottom up + * and walks memblock regions within that node bottom-up as well, but allocation + * within an memblock region is top-down. XXX I plan to fix that at some stage + * + * WARNING: Only available after early_node_map[] has been populated, + * on some architectures, that is after all the calls to add_active_range() + * have been done to populate it. + */ + +phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) { - u64 base, res_base; - long j; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * This code originates from sparc which really wants use to walk by addresses + * and returns the nid. This is not very convenient for early_pfn_map[] users + * as the map isn't sorted yet, and it really wants to be walked by nid. + * + * For now, I implement the inefficient method below which walks the early + * map multiple times. Eventually we may want to use an ARCH config option + * to implement a completely different method for both case. 
+ */ + unsigned long start_pfn, end_pfn; + int i; - base = memblock_align_down((end - size), align); - while (start <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - base = ~(u64)0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); + for (i = 0; i < MAX_NUMNODES; i++) { + get_pfn_range_for_nid(i, &start_pfn, &end_pfn); + if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + continue; + *nid = i; + return min(end, PFN_PHYS(end_pfn)); } +#endif + *nid = 0; - return ~(u64)0; + return end; } -static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, - u64 (*nid_range)(u64, u64, int *), - u64 size, u64 align, int nid) +static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, + phys_addr_t size, + phys_addr_t align, int nid) { - u64 start, end; + phys_addr_t start, end; start = mp->base; end = start + mp->size; start = memblock_align_up(start, align); while (start < end) { - u64 this_end; + phys_addr_t this_end; int this_nid; - this_end = nid_range(start, end, &this_nid); + this_end = memblock_nid_range(start, end, &this_nid); if (this_nid == nid) { - u64 ret = memblock_alloc_nid_unreserved(start, this_end, - size, align); - if (ret != ~(u64)0) + phys_addr_t ret = memblock_find_region(start, this_end, size, align); + if (ret != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, ret, size) >= 0) return ret; } start = this_end; } - return ~(u64)0; + return MEMBLOCK_ERROR; } -u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, - u64 (*nid_range)(u64 start, u64 end, int *nid)) +phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - struct memblock_region *mem = &memblock.memory; + struct memblock_type *mem = &memblock.memory; int i; BUG_ON(0 == size); + /* We align the size to limit fragmentation. 
Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ size = memblock_align_up(size, align); + /* We do a bottom-up search for a region with the right + * nid since that's easier considering how memblock_nid_range() + * works + */ for (i = 0; i < mem->cnt; i++) { - u64 ret = memblock_alloc_nid_region(&mem->region[i], - nid_range, + phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], size, align, nid); - if (ret != ~(u64)0) + if (ret != MEMBLOCK_ERROR) return ret; } - return memblock_alloc(size, align); -} - -u64 __init memblock_alloc(u64 size, u64 align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + return 0; } -u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) +phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - u64 alloc; + phys_addr_t res = memblock_alloc_nid(size, align, nid); - alloc = __memblock_alloc_base(size, align, max_addr); - - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); - - return alloc; + if (res) + return res; + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); } -u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr) -{ - long i, j; - u64 base = 0; - u64 res_base; - - BUG_ON(0 == size); - - size = memblock_align_up(size, align); - - /* On some platforms, make sure we allocate lowmem */ - /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */ - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - max_addr = MEMBLOCK_REAL_LIMIT; - - for (i = memblock.memory.cnt - 1; i >= 0; i--) { - u64 memblockbase = memblock.memory.region[i].base; - u64 memblocksize = memblock.memory.region[i].size; - - if (memblocksize < size) - continue; - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - base = memblock_align_down(memblockbase + memblocksize - size, align); - else if (memblockbase < max_addr) { - base = min(memblockbase + memblocksize, max_addr); - base = memblock_align_down(base - size, align); - } else - continue; - while (base && memblockbase <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - return 0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); - } - } - return 0; -} +/* + * Remaining API functions + */ /* You must call memblock_analyze() before this. */ -u64 __init memblock_phys_mem_size(void) +phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory.size; + return memblock.memory_size; } -u64 memblock_end_of_DRAM(void) +phys_addr_t __init_memblock memblock_end_of_DRAM(void) { int idx = memblock.memory.cnt - 1; - return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } /* You must call memblock_analyze() after this. */ -void __init memblock_enforce_memory_limit(u64 memory_limit) +void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) { unsigned long i; - u64 limit; - struct memblock_property *p; + phys_addr_t limit; + struct memblock_region *p; if (!memory_limit) return; @@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) /* Truncate the memblock regions to satisfy the memory limit. 
*/ limit = memory_limit; for (i = 0; i < memblock.memory.cnt; i++) { - if (limit > memblock.memory.region[i].size) { - limit -= memblock.memory.region[i].size; + if (limit > memblock.memory.regions[i].size) { + limit -= memblock.memory.regions[i].size; continue; } - memblock.memory.region[i].size = limit; + memblock.memory.regions[i].size = limit; memblock.memory.cnt = i + 1; break; } - if (memblock.memory.region[0].size < memblock.rmo_size) - memblock.rmo_size = memblock.memory.region[0].size; - memory_limit = memblock_end_of_DRAM(); /* And truncate any reserves above the limit also. */ for (i = 0; i < memblock.reserved.cnt; i++) { - p = &memblock.reserved.region[i]; + p = &memblock.reserved.regions[i]; if (p->base > memory_limit) p->size = 0; @@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) } } -int __init memblock_is_reserved(u64 addr) +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +int __init memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +int __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.reserved, base); + + if (idx == -1) + return 0; + return memblock.reserved.regions[idx].base <= base && + (memblock.reserved.regions[idx].base + + memblock.reserved.regions[idx].size) >= (base + size); +} + +int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + + +void __init_memblock memblock_set_current_limit(phys_addr_t limit) { + memblock.current_limit = limit; +} + +static void __init_memblock memblock_dump(struct memblock_type *region, char *name) +{ + unsigned long long base, size; int i; - for (i = 0; i < memblock.reserved.cnt; i++) { - u64 upper = memblock.reserved.region[i].base + - memblock.reserved.region[i].size - 1; - if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) - return 1; + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + + for (i = 0; i < region->cnt; i++) { + base = region->regions[i].base; + size = region->regions[i].size; + + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", + name, i, base, base + size - 1, size); } - return 0; } -int memblock_is_region_reserved(u64 base, u64 size) +void __init_memblock memblock_dump_all(void) { - return memblock_overlaps_region(&memblock.reserved, base, size); + if (!memblock_debug) + return; + + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + + memblock_dump(&memblock.memory, "memory"); + memblock_dump(&memblock.reserved, "reserved"); } -/* - * Given a <base, len>, find which memory regions belong to this range. - * Adjust the request and return a contiguous chunk. 
- */ -int memblock_find(struct memblock_property *res) +void __init memblock_analyze(void) { int i; - u64 rstart, rend; - rstart = res->base; - rend = rstart + res->size - 1; + /* Check marker in the unused last array entry */ + WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + + memblock.memory_size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory_size += memblock.memory.regions[i].size; + + /* We allow resizing from there */ + memblock_can_resize = 1; +} + +void __init memblock_init(void) +{ + static int init_done __initdata = 0; + + if (init_done) + return; + init_done = 1; + + /* Hookup the initial arrays */ + memblock.memory.regions = memblock_memory_init_regions; + memblock.memory.max = INIT_MEMBLOCK_REGIONS; + memblock.reserved.regions = memblock_reserved_init_regions; + memblock.reserved.max = INIT_MEMBLOCK_REGIONS; + + /* Write a marker in the unused last array entry */ + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.regions[0].base = 0; + memblock.memory.regions[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. */ + memblock.reserved.regions[0].base = 0; + memblock.reserved.regions[0].size = 0; + memblock.reserved.cnt = 1; + + memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; +} + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) + +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + seq_printf(m, "%4d: ", i); + if (sizeof(phys_addr_t) == 4) + seq_printf(m, "0x%08lx..0x%08lx\n", + (unsigned long)reg->base, + (unsigned long)(reg->base + reg->size - 1)); + else + seq_printf(m, "0x%016llx..0x%016llx\n", + (unsigned long long)reg->base, + (unsigned long long)(reg->base + reg->size - 1)); - for (i = 0; i < memblock.memory.cnt; i++) { - u64 start = memblock.memory.region[i].base; - u64 end = start + memblock.memory.region[i].size - 1; - - if (start > rend) - return -1; - - if ((end >= rstart) && (start < rend)) { - /* adjust the request */ - if (rstart < start) - rstart = start; - if (rend > end) - rend = end; - res->base = rstart; - res->size = rend - rstart + 1; - return 0; - } } - return -1; + return 0; +} + +static int memblock_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, memblock_debug_show, inode->i_private); } + +static const struct file_operations memblock_debug_fops = { + .open = memblock_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + if (!root) + return -ENXIO; + debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* 
CONFIG_DEBUG_FS */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20a8193a7af8..9a99cfaf0a19 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -47,10 +47,13 @@ #include <linux/mm_inline.h> #include <linux/page_cgroup.h> #include <linux/cpu.h> +#include <linux/oom.h> #include "internal.h" #include <asm/uaccess.h> +#include <trace/events/vmscan.h> + struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -86,7 +89,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ - MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ + /* incremented at every pagein/pageout */ + MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, + MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; @@ -211,8 +217,6 @@ struct mem_cgroup { */ spinlock_t reclaim_param_lock; - int prev_priority; /* for recording reclaim priority */ - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -253,6 +257,12 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu *stat; + /* + * used when a cpu is offlined or other synchronizations + * See mem_cgroup_read_stat(). + */ + struct mem_cgroup_stat_cpu nocpu_base; + spinlock_t pcp_counter_lock; }; /* Stuffs for move charges at task migration. */ @@ -268,6 +278,7 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { + spinlock_t lock; /* for from, to, moving_task */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; @@ -276,6 +287,7 @@ static struct move_charge_struct { struct task_struct *moving_task; /* a task moving charges */ wait_queue_head_t waitq; /* a waitq for other context */ } mc = { + .lock = __SPIN_LOCK_UNLOCKED(mc.lock), .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; @@ -527,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) return mz; } +/* + * Implementation Note: reading percpu statistics for memcg. + * + * Both of vmstat[] and percpu_counter has threshold and do periodic + * synchronization to implement "quick" read. There are trade-off between + * reading cost and precision of value. Then, we may have a chance to implement + * a periodic synchronizion of counter in memcg's counter. + * + * But this _read() function is used for user interface now. The user accounts + * memory usage by memory cgroup and he _always_ requires exact value because + * he accounts memory. Even if we provide quick-and-fuzzy read, we always + * have to visit all online cpus and make sum. So, for now, unnecessary + * synchronization is not implemented. (just implemented for cpu hotplug) + * + * If there are kernel internal actions which can make use of some not-exact + * value, and reading all cpu value can be performance bottleneck in some + * common workload, threashold and synchonization as vmstat[] should be + * implemented. 
+ */ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, enum mem_cgroup_stat_index idx) { int cpu; s64 val = 0; - for_each_possible_cpu(cpu) + get_online_cpus(); + for_each_online_cpu(cpu) val += per_cpu(mem->stat->count[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + put_online_cpus(); return val; } @@ -656,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -/* - * Call callback function against all cgroup under hierarchy tree. - */ -static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, - int (*func)(struct mem_cgroup *, void *)) +/* The caller has to guarantee "mem" exists before calling this */ +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) { - int found, ret, nextid; struct cgroup_subsys_state *css; - struct mem_cgroup *mem; - - if (!root->use_hierarchy) - return (*func)(root, data); + int found; - nextid = 1; - do { - ret = 0; + if (!mem) /* ROOT cgroup has the smallest ID */ + return root_mem_cgroup; /*css_put/get against root is ignored*/ + if (!mem->use_hierarchy) { + if (css_tryget(&mem->css)) + return mem; + return NULL; + } + rcu_read_lock(); + /* + * searching a memory cgroup which has the smallest ID under given + * ROOT cgroup. (ID >= 1) + */ + css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + else mem = NULL; + rcu_read_unlock(); + return mem; +} + +static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, + struct mem_cgroup *root, + bool cond) +{ + int nextid = css_id(&iter->css) + 1; + int found; + int hierarchy_used; + struct cgroup_subsys_state *css; + hierarchy_used = iter->use_hierarchy; + + css_put(&iter->css); + /* If no ROOT, walk all, ignore hierarchy */ + if (!cond || (root && !hierarchy_used)) + return NULL; + + if (!root) + root = root_mem_cgroup; + + do { + iter = NULL; rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, - &found); + + css = css_get_next(&mem_cgroup_subsys, nextid, + &root->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + iter = container_of(css, struct mem_cgroup, css); rcu_read_unlock(); - - if (mem) { - ret = (*func)(mem, data); - css_put(&mem->css); - } + /* If css is NULL, no more cgroups will be found */ nextid = found + 1; - } while (!ret && css); + } while (css && !iter); - return ret; + return iter; } +/* + * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please + * be careful that "break" loop is not allowed. We have reference count. + * Instead of that modify "cond" to be false and "continue" to exit the loop. 
+ */ +#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ + for (iter = mem_cgroup_start_loop(root);\ + iter != NULL;\ + iter = mem_cgroup_get_next(iter, root, cond)) + +#define for_each_mem_cgroup_tree(iter, root) \ + for_each_mem_cgroup_tree_cond(iter, root, true) + +#define for_each_mem_cgroup_all(iter) \ + for_each_mem_cgroup_tree_cond(iter, NULL, true) + static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) { @@ -836,12 +917,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; struct mem_cgroup *curr = NULL; + struct task_struct *p; - task_lock(task); - rcu_read_lock(); - curr = try_get_mem_cgroup_from_mm(task->mm); - rcu_read_unlock(); - task_unlock(task); + p = find_lock_task_mm(task); + if (!p) + return 0; + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); if (!curr) return 0; /* @@ -858,35 +940,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * prev_priority control...this will be used in memory reclaim path. - */ -int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) -{ - int prev_priority; - - spin_lock(&mem->reclaim_param_lock); - prev_priority = mem->prev_priority; - spin_unlock(&mem->reclaim_param_lock); - - return prev_priority; -} - -void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - if (priority < mem->prev_priority) - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - -void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; @@ -944,7 +997,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -954,7 +1007,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -999,7 +1052,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, LIST_HEAD(pc_list); struct list_head *src; struct page_cgroup *pc, *tmp; - int nid = z->zone_pgdat->node_id; + int nid = zone_to_nid(z); int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; int lru = LRU_FILE * file + active; @@ -1038,6 +1091,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, } *scanned = scan; + + trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, + 0, 0, 0, mode); + return nr_taken; } @@ -1072,11 +1129,90 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } -static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +static void mem_cgroup_start_move(struct mem_cgroup *mem) { - int *val = data; - (*val)++; - return 0; + int cpu; + + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&mem->pcp_counter_lock); + 
put_online_cpus(); + + synchronize_rcu(); +} + +static void mem_cgroup_end_move(struct mem_cgroup *mem) +{ + int cpu; + + if (!mem) + return; + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&mem->pcp_counter_lock); + put_online_cpus(); +} +/* + * 2 routines for checking "mem" is under move_account() or not. + * + * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used + * for avoiding race in accounting. If true, + * pc->mem_cgroup may be overwritten. + * + * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or + * under hierarchy of moving cgroups. This is for + * waiting at hith-memory prressure caused by "move". + */ + +static bool mem_cgroup_stealed(struct mem_cgroup *mem) +{ + VM_BUG_ON(!rcu_read_lock_held()); + return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +} + +static bool mem_cgroup_under_move(struct mem_cgroup *mem) +{ + struct mem_cgroup *from; + struct mem_cgroup *to; + bool ret = false; + /* + * Unlike task_move routines, we access mc.to, mc.from not under + * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. + */ + spin_lock(&mc.lock); + from = mc.from; + to = mc.to; + if (!from) + goto unlock; + if (from == mem || to == mem + || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) + || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) + ret = true; +unlock: + spin_unlock(&mc.lock); + return ret; +} + +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +{ + if (mc.moving_task && current != mc.moving_task) { + if (mem_cgroup_under_move(mem)) { + DEFINE_WAIT(wait); + prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); + /* moving charge context might have finished. */ + if (mc.moving_task) + schedule(); + finish_wait(&mc.waitq, &wait); + return true; + } + } + return false; } /** @@ -1153,11 +1289,32 @@ done: static int mem_cgroup_count_children(struct mem_cgroup *mem) { int num = 0; - mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + num++; return num; } /* + * Return the memory (and swap, if configured) limit for a memcg. + */ +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + u64 limit; + u64 memsw; + + limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + + total_swap_pages; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + /* + * If memsw is finite and limits the amount of swap space available + * to this memcg, return that limit. + */ + return min(limit, memsw); +} + +/* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use * that to reclaim free pages from. 
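/*
 * Illustrative sketch, not part of the patch above: the memcontrol.c hunks
 * replace the callback-based mem_cgroup_walk_tree() with a start/next cursor
 * driven by the for_each_mem_cgroup_tree() macros, so callers such as
 * mem_cgroup_count_children() can use an ordinary loop body. The sketch
 * below shows the same begin/next-plus-macro pattern with invented names
 * (struct node, tree_start, tree_next); the css_id ordering and reference
 * counting done by the real iterator are deliberately left out.
 */
#include <stdio.h>

struct node {
	int id;
	struct node *sibling;	/* next node in the hierarchy, sorted by id */
};

/* begin(): first node under root (the kernel asks css_get_next() for id >= 1) */
static struct node *tree_start(struct node *root)
{
	return root;
}

/* next(): following node, or NULL when the walk is finished */
static struct node *tree_next(struct node *iter, struct node *root)
{
	(void)root;	/* the real code re-checks use_hierarchy against root here */
	return iter ? iter->sibling : NULL;
}

#define for_each_node_tree(iter, root) \
	for ((iter) = tree_start(root); (iter) != NULL; \
	     (iter) = tree_next((iter), (root)))

int main(void)
{
	struct node c = { .id = 3, .sibling = NULL };
	struct node b = { .id = 2, .sibling = &c };
	struct node a = { .id = 1, .sibling = &b };
	struct node *iter;
	int count = 0;

	/* Same shape as mem_cgroup_count_children(): count every visited node. */
	for_each_node_tree(iter, &a)
		count++;
	printf("%d nodes\n", count);	/* prints "3 nodes" */
	return 0;
}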
@@ -1262,8 +1419,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* we use swappiness of local cgroup */ if (check_soft) ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - zone->zone_pgdat->node_id); + noswap, get_swappiness(victim), zone); else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); @@ -1285,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return total; } -static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) -{ - int *val = (int *)data; - int x; - /* - * Logically, we can stop scanning immediately when we find - * a memcg is already locked. But condidering unlock ops and - * creation/removal of memcg, scan-all is simple operation. - */ - x = atomic_inc_return(&mem->oom_lock); - *val = max(x, *val); - return 0; -} /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int lock_count = 0; + int x, lock_count = 0; + struct mem_cgroup *iter; - mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); + for_each_mem_cgroup_tree(iter, mem) { + x = atomic_inc_return(&iter->oom_lock); + lock_count = max(x, lock_count); + } if (lock_count == 1) return true; return false; } -static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ - atomic_add_unless(&mem->oom_lock, -1, 0); + for_each_mem_cgroup_tree(iter, mem) + atomic_add_unless(&iter->oom_lock, -1, 0); return 0; } -static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) -{ - mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); -} static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1370,7 +1516,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) static void memcg_oom_recover(struct mem_cgroup *mem) { - if (atomic_read(&mem->oom_lock)) + if (mem && atomic_read(&mem->oom_lock)) memcg_wakeup_oom(mem); } @@ -1425,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) /* * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. + * + * Notes: Race condition + * + * We usually use page_cgroup_lock() for accessing page_cgroup member but + * it tends to be costly. But considering some conditions, we doesn't need + * to do so _always_. + * + * Considering "charge", lock_page_cgroup() is not required because all + * file-stat operations happen after a page is attached to radix-tree. There + * are no race with "charge". + * + * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup + * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even + * if there are race with "uncharge". Statistics itself is properly handled + * by flags. + * + * Considering "move", this is an only case we see a race. To make the race + * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are + * possibility of race condition. If there is, we take a lock. 
*/ -void mem_cgroup_update_file_mapped(struct page *page, int val) + +static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) { struct mem_cgroup *mem; - struct page_cgroup *pc; + struct page_cgroup *pc = lookup_page_cgroup(page); + bool need_unlock = false; - pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - lock_page_cgroup(pc); + rcu_read_lock(); mem = pc->mem_cgroup; - if (!mem || !PageCgroupUsed(pc)) - goto done; + if (unlikely(!mem || !PageCgroupUsed(pc))) + goto out; + /* pc->mem_cgroup is unstable ? */ + if (unlikely(mem_cgroup_stealed(mem))) { + /* take a lock against to access pc->mem_cgroup */ + lock_page_cgroup(pc); + need_unlock = true; + mem = pc->mem_cgroup; + if (!mem || !PageCgroupUsed(pc)) + goto out; + } - /* - * Preemption is already disabled. We can use __this_cpu_xxx - */ - if (val > 0) { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - SetPageCgroupFileMapped(pc); - } else { - __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - ClearPageCgroupFileMapped(pc); + this_cpu_add(mem->stat->count[idx], val); + + switch (idx) { + case MEM_CGROUP_STAT_FILE_MAPPED: + if (val > 0) + SetPageCgroupFileMapped(pc); + else if (!page_mapped(page)) + ClearPageCgroupFileMapped(pc); + break; + default: + BUG(); } -done: - unlock_page_cgroup(pc); +out: + if (unlikely(need_unlock)) + unlock_page_cgroup(pc); + rcu_read_unlock(); + return; +} + +void mem_cgroup_update_file_mapped(struct page *page, int val) +{ + mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); } /* @@ -1568,30 +1753,137 @@ static void drain_all_stock_sync(void) atomic_dec(&memcg_drain_count); } -static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, +/* + * This function drains percpu counter value from DEAD cpu and + * move it to local cpu. Note that this function can be preempted. + */ +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +{ + int i; + + spin_lock(&mem->pcp_counter_lock); + for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { + s64 x = per_cpu(mem->stat->count[i], cpu); + + per_cpu(mem->stat->count[i], cpu) = 0; + mem->nocpu_base.count[i] += x; + } + /* need to clear ON_MOVE value, works as a kind of lock. */ + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; + spin_unlock(&mem->pcp_counter_lock); +} + +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +{ + int idx = MEM_CGROUP_ON_MOVE; + + spin_lock(&mem->pcp_counter_lock); + per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +} + +static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; + struct mem_cgroup *iter; + + if ((action == CPU_ONLINE)) { + for_each_mem_cgroup_all(iter) + synchronize_mem_cgroup_on_move(iter, cpu); + return NOTIFY_OK; + } - if (action != CPU_DEAD) + if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; + + for_each_mem_cgroup_all(iter) + mem_cgroup_drain_pcp_counter(iter, cpu); + stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; } + +/* See __mem_cgroup_try_charge() for details */ +enum { + CHARGE_OK, /* success */ + CHARGE_RETRY, /* need to retry but retry is not bad */ + CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ + CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. 
*/ + CHARGE_OOM_DIE, /* the current is killed because of OOM */ +}; + +static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + int csize, bool oom_check) +{ + struct mem_cgroup *mem_over_limit; + struct res_counter *fail_res; + unsigned long flags = 0; + int ret; + + ret = res_counter_charge(&mem->res, csize, &fail_res); + + if (likely(!ret)) { + if (!do_swap_account) + return CHARGE_OK; + ret = res_counter_charge(&mem->memsw, csize, &fail_res); + if (likely(!ret)) + return CHARGE_OK; + + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + flags |= MEM_CGROUP_RECLAIM_NOSWAP; + } else + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (csize > PAGE_SIZE) /* change csize and retry */ + return CHARGE_RETRY; + + if (!(gfp_mask & __GFP_WAIT)) + return CHARGE_WOULDBLOCK; + + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); + /* + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ + if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + return CHARGE_RETRY; + + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. + */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + return CHARGE_RETRY; + + /* If we don't need to call oom-killer at el, return immediately */ + if (!oom_check) + return CHARGE_NOMEM; + /* check OOM */ + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) + return CHARGE_OOM_DIE; + + return CHARGE_RETRY; +} + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) { - struct mem_cgroup *mem, *mem_over_limit; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct res_counter *fail_res; + int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct mem_cgroup *mem = NULL; + int ret; int csize = CHARGE_SIZE; /* @@ -1609,126 +1901,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). 
*/ - mem = *memcg; - if (likely(!mem)) { - mem = try_get_mem_cgroup_from_mm(mm); - *memcg = mem; - } else { - css_get(&mem->css); - } - if (unlikely(!mem)) - return 0; - - VM_BUG_ON(css_is_removed(&mem->css)); - if (mem_cgroup_is_root(mem)) - goto done; - - while (1) { - int ret = 0; - unsigned long flags = 0; - + if (!*memcg && !mm) + goto bypass; +again: + if (*memcg) { /* css should be a valid one */ + mem = *memcg; + VM_BUG_ON(css_is_removed(&mem->css)); + if (mem_cgroup_is_root(mem)) + goto done; if (consume_stock(mem)) goto done; + css_get(&mem->css); + } else { + struct task_struct *p; - ret = res_counter_charge(&mem->res, csize, &fail_res); - if (likely(!ret)) { - if (!do_swap_account) - break; - ret = res_counter_charge(&mem->memsw, csize, &fail_res); - if (likely(!ret)) - break; - /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, csize); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - memsw); - } else - /* mem counter fails */ - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - res); - - /* reduce request size and retry */ - if (csize > PAGE_SIZE) { - csize = PAGE_SIZE; - continue; - } - if (!(gfp_mask & __GFP_WAIT)) - goto nomem; - - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); - if (ret) - continue; - + rcu_read_lock(); + p = rcu_dereference(mm->owner); + VM_BUG_ON(!p); /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up - * + * because we don't have task_lock(), "p" can exit while + * we're here. In that case, "mem" can point to root + * cgroup but never be NULL. (and task_struct itself is freed + * by RCU, cgroup itself is RCU safe.) Then, we have small + * risk here to get wrong cgroup. But such kind of mis-account + * by race always happens because we don't have cgroup_mutex(). + * It's overkill and we allow that small race, here. */ - if (mem_cgroup_check_under_limit(mem_over_limit)) - continue; - - /* try to avoid oom while someone is moving charge */ - if (mc.moving_task && current != mc.moving_task) { - struct mem_cgroup *from, *to; - bool do_continue = false; + mem = mem_cgroup_from_task(p); + VM_BUG_ON(!mem); + if (mem_cgroup_is_root(mem)) { + rcu_read_unlock(); + goto done; + } + if (consume_stock(mem)) { /* - * There is a small race that "from" or "to" can be - * freed by rmdir, so we use css_tryget(). + * It seems dagerous to access memcg without css_get(). + * But considering how consume_stok works, it's not + * necessary. If consume_stock success, some charges + * from this memcg are cached on this cpu. So, we + * don't need to call css_get()/css_tryget() before + * calling consume_stock(). */ - from = mc.from; - to = mc.to; - if (from && css_tryget(&from->css)) { - if (mem_over_limit->use_hierarchy) - do_continue = css_is_ancestor( - &from->css, - &mem_over_limit->css); - else - do_continue = (from == mem_over_limit); - css_put(&from->css); - } - if (!do_continue && to && css_tryget(&to->css)) { - if (mem_over_limit->use_hierarchy) - do_continue = css_is_ancestor( - &to->css, - &mem_over_limit->css); - else - do_continue = (to == mem_over_limit); - css_put(&to->css); - } - if (do_continue) { - DEFINE_WAIT(wait); - prepare_to_wait(&mc.waitq, &wait, - TASK_INTERRUPTIBLE); - /* moving charge context might have finished. 
*/ - if (mc.moving_task) - schedule(); - finish_wait(&mc.waitq, &wait); - continue; - } + rcu_read_unlock(); + goto done; + } + /* after here, we may be blocked. we need to get refcnt */ + if (!css_tryget(&mem->css)) { + rcu_read_unlock(); + goto again; } + rcu_read_unlock(); + } + + do { + bool oom_check; + + /* If killed, bypass charge */ + if (fatal_signal_pending(current)) { + css_put(&mem->css); + goto bypass; + } + + oom_check = false; + if (oom && !nr_oom_retries) { + oom_check = true; + nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + } + + ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - if (!nr_retries--) { - if (!oom) + switch (ret) { + case CHARGE_OK: + break; + case CHARGE_RETRY: /* not in OOM situation but retry */ + csize = PAGE_SIZE; + css_put(&mem->css); + mem = NULL; + goto again; + case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ + css_put(&mem->css); + goto nomem; + case CHARGE_NOMEM: /* OOM routine works */ + if (!oom) { + css_put(&mem->css); goto nomem; - if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { - nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - continue; } - /* When we reach here, current task is dying .*/ + /* If oom, we never return -ENOMEM */ + nr_oom_retries--; + break; + case CHARGE_OOM_DIE: /* Killed by OOM Killer */ css_put(&mem->css); goto bypass; } - } + } while (ret != CHARGE_OK); + if (csize > PAGE_SIZE) refill_stock(mem, csize - PAGE_SIZE); + css_put(&mem->css); done: + *memcg = mem; return 0; nomem: - css_put(&mem->css); + *memcg = NULL; return -ENOMEM; bypass: *memcg = NULL; @@ -1747,11 +2021,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, res_counter_uncharge(&mem->res, PAGE_SIZE * count); if (do_swap_account) res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); - WARN_ON_ONCE(count > INT_MAX); - __css_put(&mem->css, (int)count); } - /* we don't need css_put for root */ } static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) @@ -1979,10 +2249,9 @@ out: * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype, - struct mem_cgroup *memcg) + gfp_t gfp_mask, enum charge_type ctype) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; @@ -1992,7 +2261,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - mem = memcg; ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); if (ret || !mem) return ret; @@ -2020,7 +2288,7 @@ int mem_cgroup_newpage_charge(struct page *page, if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void @@ -2030,7 +2298,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -2051,7 +2318,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; - pc = lookup_page_cgroup(page); if (!pc) return 0; @@ -2063,22 +2329,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (unlikely(!mm && !mem)) + if (unlikely(!mm)) mm = &init_mm; if (page_is_file_cache(page)) return mem_cgroup_charge_common(page, mm, gfp_mask, - 
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + MEM_CGROUP_CHARGE_TYPE_CACHE); /* shmem */ if (PageSwapCache(page)) { + struct mem_cgroup *mem = NULL; + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, MEM_CGROUP_CHARGE_TYPE_SHMEM); } else ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + MEM_CGROUP_CHARGE_TYPE_SHMEM); return ret; } @@ -2114,7 +2382,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, goto charge_cur_mm; *ptr = mem; ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); - /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: @@ -2245,7 +2512,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return NULL; @@ -2285,10 +2551,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); - if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -2299,13 +2561,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * special functions. */ - mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - + /* + * even after unlock, we have mem->res.usage here and this memcg + * will never be freed. + */ memcg_check_events(mem, page); - /* at swapout, this memcg will be accessed to record to swap */ - if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - css_put(&mem->css); + if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { + mem_cgroup_swap_statistics(mem, true); + mem_cgroup_get(mem); + } + if (!mem_cgroup_is_root(mem)) + __do_uncharge(mem, ctype); return mem; @@ -2392,13 +2659,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) memcg = __mem_cgroup_uncharge_common(page, ctype); - /* record memcg information */ - if (do_swap_account && swapout && memcg) { + /* + * record memcg information, if swapout && memcg != NULL, + * mem_cgroup_get() was called in uncharge(). + */ + if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); - mem_cgroup_get(memcg); - } - if (swapout && memcg) - css_put(&memcg->css); } #endif @@ -2476,7 +2742,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, */ if (!mem_cgroup_is_root(to)) res_counter_uncharge(&to->res, PAGE_SIZE); - css_put(&to->css); } return 0; } @@ -2611,11 +2876,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, ClearPageCgroupMigration(pc); unlock_page_cgroup(pc); - if (unused != oldpage) - pc = lookup_page_cgroup(unused); __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); - pc = lookup_page_cgroup(used); /* * If a page is a file cache, radix-tree replacement is very atomic * and we can skip this check. 
When it was an Anon page, its mapcount @@ -2791,8 +3053,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, - int zid) + gfp_t gfp_mask) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -2804,7 +3065,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node_zone(nid, zid); + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); /* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under @@ -2965,6 +3226,7 @@ move_account: lru_add_drain_all(); drain_all_stock_sync(); ret = 0; + mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; @@ -2978,6 +3240,7 @@ move_account: if (ret) break; } + mem_cgroup_end_move(mem); memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) @@ -3064,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } -struct mem_cgroup_idx_data { - s64 val; - enum mem_cgroup_stat_index idx; -}; -static int -mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) +static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { - struct mem_cgroup_idx_data *d = data; - d->val += mem_cgroup_read_stat(mem, d->idx); - return 0; -} + struct mem_cgroup *iter; + s64 val = 0; -static void -mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx, s64 *val) -{ - struct mem_cgroup_idx_data d; - d.idx = idx; - d.val = 0; - mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); - *val = d.val; + /* each per cpu's value can be minus.Then, use s64 */ + for_each_mem_cgroup_tree(iter, mem) + val += mem_cgroup_read_stat(iter, idx); + + if (val < 0) /* race ? 
*/ + val = 0; + return val; } static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) { - u64 idx_val, val; + u64 val; if (!mem_cgroup_is_root(mem)) { if (!swap) @@ -3099,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; + val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); - if (swap) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT, &idx_val); - val += idx_val; - } + if (swap) + val += mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } @@ -3316,9 +3567,9 @@ struct { }; -static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +static void +mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - struct mcs_total_stat *s = data; s64 val; /* per cpu stat */ @@ -3348,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; - return 0; } static void mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_get_local_stat(iter, s); } static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, @@ -3514,9 +3767,13 @@ unlock: static void mem_cgroup_threshold(struct mem_cgroup *memcg) { - __mem_cgroup_threshold(memcg, false); - if (do_swap_account) - __mem_cgroup_threshold(memcg, true); + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } } static int compare_thresholds(const void *a, const void *b) @@ -3527,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) { struct mem_cgroup_eventfd_list *ev; @@ -3538,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) static void mem_cgroup_oom_notify(struct mem_cgroup *mem) { - mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_oom_notify_cb(iter); } static int mem_cgroup_usage_register_event(struct cgroup *cgrp, @@ -3759,8 +4019,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, return 0; } -/* - */ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { @@ -3957,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) vfree(mem); mem = NULL; } + spin_lock_init(&mem->pcp_counter_lock); return mem; } @@ -4083,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) &per_cpu(memcg_stock, cpu); INIT_WORK(&stock->work, drain_local_stock); } - hotcpu_notifier(memcg_stock_cpu_callback, 0); + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -4180,9 +4439,6 @@ 
static int mem_cgroup_do_precharge(unsigned long count) goto one_by_one; } mc.precharge += count; - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); - WARN_ON_ONCE(count > INT_MAX); - __css_get(&mem->css, (int)count); return ret; } one_by_one: @@ -4400,11 +4656,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm) static void mem_cgroup_clear_mc(void) { + struct mem_cgroup *from = mc.from; + struct mem_cgroup *to = mc.to; + /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { __mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; - memcg_oom_recover(mc.to); } /* * we didn't uncharge from mc.from at mem_cgroup_move_account(), so @@ -4413,11 +4671,9 @@ static void mem_cgroup_clear_mc(void) if (mc.moved_charge) { __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; - memcg_oom_recover(mc.from); } /* we must fixup refcnts and charges */ if (mc.moved_swap) { - WARN_ON_ONCE(mc.moved_swap > INT_MAX); /* uncharge swap account from the old cgroup */ if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, @@ -4431,16 +4687,19 @@ static void mem_cgroup_clear_mc(void) */ res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); - VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); - __css_put(&mc.to->css, mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ mc.moved_swap = 0; } + spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; mc.moving_task = NULL; + spin_unlock(&mc.lock); + mem_cgroup_end_move(from); + memcg_oom_recover(from); + memcg_oom_recover(to); wake_up_all(&mc.waitq); } @@ -4469,12 +4728,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + mem_cgroup_start_move(from); + spin_lock(&mc.lock); mc.from = from; mc.to = mem; mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; mc.moving_task = current; + spin_unlock(&mc.lock); ret = mem_cgroup_precharge_mc(mm); if (ret) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 620b0b461593..124324134ff6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. 
Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -45,6 +49,8 @@ #include <linux/page-isolation.h> #include <linux/suspend.h> #include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -76,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; @@ -181,7 +187,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter); * signal. */ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) + unsigned long pfn, struct page *page) { struct siginfo si; int ret; @@ -196,7 +202,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -233,7 +239,7 @@ void shake_page(struct page *p, int access) int nr; do { nr = shrink_slab(1000, GFP_KERNEL, 1000); - if (page_count(p) == 0) + if (page_count(p) == 1) break; } while (nr > 10); } @@ -266,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -307,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -325,7 +331,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) + int fail, struct page *page, unsigned long pfn) { struct to_kill *tk, *next; @@ -350,7 +356,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * process anyways. */ else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + pfn, page) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -575,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -689,17 +695,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) /* * Huge pages. Needs work. 
* Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. */ static int me_huge_page(struct page *p, unsigned long pfn) { - return FAILED; + int res = 0; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; + } + return DELAYED; } /* @@ -822,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -835,8 +851,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; + struct page *hpage = compound_head(p); if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -845,10 +861,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(p)) + if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageCompound(p) || PageKsm(p)) + if (PageKsm(p)) return SWAP_FAIL; if (PageSwapCache(p)) { @@ -863,10 +879,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(p); - if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { - if (page_mkclean(p)) { - SetPageDirty(p); + mapping = page_mapping(hpage); + if (!PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; @@ -885,22 +902,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(p, &tokill); - - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) 
- */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(p, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } + collect_procs(hpage, &tokill); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -911,17 +918,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(p), trapno, - ret != SWAP_SUCCESS, pfn); + kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + ret != SWAP_SUCCESS, p, pfn); return ret; } +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + int __memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; + struct page *hpage; int res; + unsigned int nr_pages; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -934,18 +959,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); + hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } - atomic_long_add(1, &mce_bad_pages); + nr_pages = 1 << compound_order(hpage); + atomic_long_add(nr_pages, &mce_bad_pages); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -953,10 +983,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(compound_head(p))) { + !get_page_unless_zero(hpage)) { if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -971,9 +1019,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. 
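
/*
 * Minimal model of the "poison every subpage of a compound page"
 * bookkeeping used above: nr_pages is derived from the compound order,
 * and the bad-page counter moves by that amount in one step. The flag
 * array and bit value are invented stand-ins for struct page and
 * PG_hwpoison, not the kernel interfaces.
 */
#include <stdio.h>

#define PG_HWPOISON_MODEL	0x1u

static long mce_bad_pages_model;

static unsigned long mark_huge_poisoned(unsigned int *flags, unsigned int order)
{
	unsigned long nr_pages = 1UL << order;	/* e.g. order 9 -> 512 subpages */
	unsigned long i;

	for (i = 0; i < nr_pages; i++)
		flags[i] |= PG_HWPOISON_MODEL;
	mce_bad_pages_model += nr_pages;	/* account the whole hugepage at once */
	return nr_pages;
}

int main(void)
{
	unsigned int flags[512] = { 0 };

	printf("poisoned %lu subpages, counter=%ld\n",
	       mark_huge_poisoned(flags, 9), mce_bad_pages_model);
	return 0;
}
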
*/ - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); - if (!PageLRU(p)) { + if (!PageLRU(p) && !PageHuge(p)) { /* * shake_page could have turned it free. */ @@ -991,7 +1039,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(p); + lock_page_nosync(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -1003,12 +1051,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); - unlock_page(p); - put_page(p); + atomic_long_sub(nr_pages, &mce_bad_pages); + unlock_page(hpage); + put_page(hpage); return 0; } + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, "hugepage already hardware poisoned", + IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + wait_on_page_writeback(p); /* @@ -1038,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } } out: - unlock_page(p); + unlock_page(hpage); return res; } EXPORT_SYMBOL_GPL(__memory_failure); @@ -1082,6 +1150,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + unsigned int nr_pages; if (!pfn_valid(pfn)) return -ENXIO; @@ -1090,14 +1159,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } + nr_pages = 1 << compound_order(page); + if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + atomic_long_sub(nr_pages, &mce_bad_pages); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1108,10 +1189,12 @@ int unpoison_memory(unsigned long pfn) * the PG_hwpoison page will be caught and isolated on the entrance to * the free buddy page pool. 
*/ - if (TestClearPageHWPoison(p)) { - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_dec(&mce_bad_pages); + if (TestClearPageHWPoison(page)) { + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1126,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1154,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) * was free. */ set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ if (!get_page_unless_zero(compound_head(p))) { - if (is_free_buddy_page(p)) { - pr_debug("get_any_page: %#lx free buddy page\n", pfn); + if (PageHuge(p)) { + pr_info("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { + pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } @@ -1174,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return ret; } +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + putback_lru_pages(&pagelist); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ + return ret; +} + /** * soft_offline_page - Soft offline a page. 
* @page: page to offline @@ -1201,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; @@ -1227,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } @@ -1241,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); - pr_debug("soft offline: %#lx page already poisoned\n", pfn); + pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1262,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; - pr_debug("soft_offline: %#lx: invalidated\n", pfn); + pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } @@ -1278,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) @@ -1296,3 +1433,35 @@ done: /* keep elevated page count for bad page */ return ret; } + +/* + * The caller must hold current->mm->mmap_sem in read mode. + */ +int is_hwpoison_address(unsigned long addr) +{ + pgd_t *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t pte, *ptep; + swp_entry_t entry; + + pgdp = pgd_offset(current->mm, addr); + if (!pgd_present(*pgdp)) + return 0; + pudp = pud_offset(pgdp, addr); + pud = *pudp; + if (!pud_present(pud) || pud_large(pud)) + return 0; + pmdp = pmd_offset(pudp, addr); + pmd = *pmdp; + if (!pmd_present(pmd) || pmd_large(pmd)) + return 0; + ptep = pte_offset_map(pmdp, addr); + pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) + return 0; + entry = pte_to_swp_entry(pte); + return is_hwpoison_entry(entry); +} +EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index bde42c6d3633..02e48aa0ed13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* * The next few lines have given us lots of grief... 
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb, if (addr > end - 1) return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -738,7 +736,7 @@ again: dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; @@ -769,7 +767,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(orig_src_pte); + pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -1452,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| + VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -1592,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); @@ -2008,11 +2007,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long start = addr, end = addr + size; + unsigned long end = addr + size; int err; BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -2020,7 +2018,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - mmu_notifier_invalidate_range_end(mm, start, end); + return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -2082,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else @@ -2110,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; @@ -2626,10 +2625,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page; + struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; + int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2678,13 +2679,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } - page = ksm_might_need_to_copy(page, vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us. The page pin, and pte_same + * test below, are not enough to exclude that. 
Even if it is still + * swapcache, we need to check that the page's swap has not changed. + */ + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + goto out_page; + + if (ksm_might_need_to_copy(page, vma, address)) { + swapcache = page; + page = ksm_does_need_to_copy(page, vma, address); + + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + swapcache = NULL; + goto out_page; + } } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { @@ -2724,10 +2744,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2735,6 +2757,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); + if (swapcache) { + /* + * Hold the lock to avoid the swap entry to be reused + * until we take the PT lock for the pte_same() check + * (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache. + */ + unlock_page(swapcache); + page_cache_release(swapcache); + } if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); @@ -2756,10 +2790,48 @@ out_page: unlock_page(page); out_release: page_cache_release(page); + if (swapcache) { + unlock_page(swapcache); + page_cache_release(swapcache); + } return ret; } /* + * This is like a special single-page "expand_{down|up}wards()", + * except we must first make sure that 'address{-|+}PAGE_SIZE' + * doesn't hit another vma. + */ +static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) +{ + address &= PAGE_MASK; + if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { + struct vm_area_struct *prev = vma->vm_prev; + + /* + * Is there a mapping abutting this one below? + * + * That's only ok if it's the same stack mapping + * that has gotten split.. + */ + if (prev && prev->vm_end == address) + return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; + + expand_stack(vma, address - PAGE_SIZE); + } + if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { + struct vm_area_struct *next = vma->vm_next; + + /* As VM_GROWSDOWN but s/below/above/ */ + if (next && next->vm_start == address + PAGE_SIZE) + return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; + + expand_upwards(vma, address + PAGE_SIZE); + } + return 0; +} + +/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
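
/*
 * A simplified model of the guard-page rule added above for stacks that
 * grow down: a fault on the lowest page of the stack vma may only expand
 * the stack when no unrelated mapping ends exactly at that address. The
 * struct, flag value and page size are invented stand-ins, and the actual
 * expansion step is omitted.
 */
#include <stdio.h>

#define PAGE_SZ_MODEL		4096UL
#define VM_GROWSDOWN_MODEL	0x1UL

struct vma_model {
	unsigned long start, end, flags;
	struct vma_model *prev;
};

/* 0: fault/expansion may proceed, -1: abutting foreign mapping blocks it */
static int stack_guard_ok(struct vma_model *vma, unsigned long addr)
{
	addr &= ~(PAGE_SZ_MODEL - 1);
	if ((vma->flags & VM_GROWSDOWN_MODEL) && addr == vma->start) {
		struct vma_model *prev = vma->prev;

		if (prev && prev->end == addr)
			return (prev->flags & VM_GROWSDOWN_MODEL) ? 0 : -1;
	}
	return 0;
}

int main(void)
{
	struct vma_model heap  = { 0x1000, 0x8000, 0, NULL };
	struct vma_model stack = { 0x8000, 0xF000, VM_GROWSDOWN_MODEL, &heap };

	/* heap ends right where the stack begins, so expansion is refused */
	printf("fault at stack base: %d\n", stack_guard_ok(&stack, 0x8000));
	return 0;
}
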
@@ -2772,19 +2844,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + pte_unmap(page_table); + + /* Check if we need to add a guard page to the stack */ + if (check_stack_guard_page(vma, address) < 0) + return VM_FAULT_SIGBUS; + + /* Use the zero-page for reads */ if (!(flags & FAULT_FLAG_WRITE)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot)); - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; goto setpte; } /* Allocate our own private page. */ - pte_unmap(page_table); - if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2857,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; if (unlikely(PageHWPoison(vmf.page))) { @@ -3116,7 +3193,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * with threads. */ if (flags & FAULT_FLAG_WRITE) - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } unlock: pte_unmap_unlock(pte, ptl); @@ -3274,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3311,6 +3388,17 @@ out: return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a4cfcdc00455..9260314a221e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -584,45 +584,32 @@ static inline int pageblock_free(struct page *page) /* Return the start of the next active pageblock after a given page */ static struct page *next_active_pageblock(struct page *page) { - int pageblocks_stride; - /* Ensure the starting page is pageblock-aligned */ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); - /* Move forward by at least 1 * pageblock_nr_pages */ - pageblocks_stride = 1; - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) - pageblocks_stride += page_order(page) - pageblock_order; + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } - return page + (pageblocks_stride * pageblock_nr_pages); + return page + pageblock_nr_pages; } /* Checks if this range of memory is likely to be hot-removable. 
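
/*
 * Sketch of the corrected pageblock stride logic above, on plain pfns:
 * a free buddy chunk whose order is at least pageblock_order is skipped
 * in one step, anything else advances by exactly one pageblock. The
 * constants and the "free order" argument are illustrative stand-ins
 * (a negative order means the block is not a free buddy chunk).
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER_M	9
#define MAX_ORDER_M		11

static unsigned long next_pageblock_pfn(unsigned long pfn, int free_order)
{
	if (free_order >= PAGEBLOCK_ORDER_M && free_order < MAX_ORDER_M)
		return pfn + (1UL << free_order);	/* skip the free chunk */
	return pfn + (1UL << PAGEBLOCK_ORDER_M);	/* one pageblock forward */
}

int main(void)
{
	printf("%lu %lu\n", next_pageblock_pfn(0, -1), next_pageblock_pfn(0, 10));
	return 0;
}
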
*/ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ @@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -int scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { - /* Becasue we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) - not_managed++; #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); dump_page(page); #endif + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + ret = -EBUSY; + break; + } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); - out: return ret; } @@ -840,7 +828,6 @@ repeat: ret = 0; if (drain) { lru_add_drain_all(); - flush_scheduled_work(); cond_resched(); drain_all_pages(); } @@ -862,7 +849,6 @@ repeat: } /* drain all zone's lru pagevec, this is asyncronous... */ lru_add_drain_all(); - flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. 
*/ drain_all_pages(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5bc0a96beb51..4a57f135b76e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len, err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, 0); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; @@ -1275,33 +1284,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; + struct mm_struct *mm = NULL; struct task_struct *task; - nodemask_t old; - nodemask_t new; nodemask_t task_nodes; int err; + nodemask_t *old; + nodemask_t *new; + NODEMASK_SCRATCH(scratch); - err = get_nodes(&old, old_nodes, maxnode); + if (!scratch) + return -ENOMEM; + + old = &scratch->mask1; + new = &scratch->mask2; + + err = get_nodes(old, old_nodes, maxnode); if (err) - return err; + goto out; - err = get_nodes(&new, new_nodes, maxnode); + err = get_nodes(new, new_nodes, maxnode); if (err) - return err; + goto out; /* Find the mm_struct */ read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); - return -ESRCH; + err = -ESRCH; + goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + err = -EINVAL; if (!mm) - return -EINVAL; + goto out; /* * Check if this process has the right to modify the specified @@ -1322,12 +1340,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { err = -EINVAL; goto out; } @@ -1336,10 +1354,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, if (err) goto out; - err = do_migrate_pages(mm, &old, &new, + err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: - mmput(mm); + if (mm) + mmput(mm); + NODEMASK_SCRATCH_FREE(scratch); + return err; } @@ -1576,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy) (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone->node; + return zone ? zone->node : numa_node_id(); } default: @@ -1712,6 +1733,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) } #endif +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. 
Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. + */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fallback to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, diff --git a/mm/migrate.c b/mm/migrate.c index 4205b1d6049e..fe5a3c6a5426 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/hugetlb.h> #include <linux/gfp.h> #include "internal.h" @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte_t *ptep, pte; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; + if (unlikely(PageHuge(new))) { + ptep = huge_pte_offset(mm, addr); + if (!ptep) + goto out; + ptl = &mm->page_table_lock; + } else { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; - ptep = pte_offset_map(pmd, addr); + ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } + + ptl = pte_lockptr(mm, pmd); + } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) @@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(new)) + pte = pte_mkhuge(pte); +#endif flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); - if (PageAnon(new)) + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, addr); + else + page_dup_rmap(new); + } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr); else page_add_file_rmap(new); @@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * The expected number of remaining references is the same as that + * of migrate_page_move_mapping(). 
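
/*
 * Sketch of the mempolicy/nodemask decision described a little earlier,
 * with a plain bitmask standing in for nodemask_t: only BIND and
 * INTERLEAVE restrict allocations to their node set, so only they need a
 * real intersection test; preferred/local and the default policy may fall
 * back anywhere and report true. Enum and helper names are invented.
 */
#include <stdbool.h>
#include <stdio.h>

enum pol_mode { POL_DEFAULT, POL_PREFERRED, POL_BIND, POL_INTERLEAVE };

static bool policy_may_use_nodes(enum pol_mode mode,
				 unsigned long policy_nodes, unsigned long mask)
{
	switch (mode) {
	case POL_BIND:
	case POL_INTERLEAVE:
		return (policy_nodes & mask) != 0;
	default:
		return true;	/* default / preferred: fallback is possible */
	}
}

int main(void)
{
	/* policy bound to node 0 (bit 0), asked about nodes {2,3} (bits 2,3) */
	printf("%d\n", policy_may_use_nodes(POL_BIND, 0x1UL, 0xCUL));
	return 0;
}
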
+ */ +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int expected_count; + void **pslot; + + if (!mapping) { + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + get_page(newpage); + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + + __put_page(page); + + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* * Copy the page to its new location */ -static void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_copy(struct page *newpage, struct page *page) { - copy_highpage(newpage, page); + if (PageHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); if (PageError(page)) SetPageError(newpage); @@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; @@ -639,7 +704,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * exist when the page is remapped later */ anon_vma = page_anon_vma(page); - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } } @@ -682,12 +747,8 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } + if (anon_vma) + drop_anon_vma(anon_vma); if (rcu_locked) rcu_read_unlock(); @@ -728,6 +789,92 @@ move_newpage: } /* + * Counterpart of unmap_and_move_page() for hugepage migration. + * + * This function doesn't wait the completion of hugepage I/O + * because there is no race between I/O and migration for hugepage. + * Note that currently hugepage I/O occurs only in direct I/O + * where no lock is held and PG_writeback is irrelevant, + * and writeback status of all subpages are counted in the reference + * count of the head page (i.e. if all subpages of a 2MB hugepage are + * under direct I/O, the reference of the head page is 512 and a bit more.) + * This means that when we try to migrate hugepage whose subpages are + * doing direct I/O, some references remain after try_to_unmap() and + * hugepage migration fails without data corruption. + * + * There is also no race when direct I/O is issued on the page under migration, + * because then pte is replaced with migration swap entry and direct I/O code + * will wait in the page fault for migration to complete. 
+ */ +static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, + int force, int offlining) +{ + int rc = 0; + int *result = NULL; + struct page *new_hpage = get_new_page(hpage, private, &result); + int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; + + if (!new_hpage) + return -ENOMEM; + + rc = -EAGAIN; + + if (!trylock_page(hpage)) { + if (!force) + goto out; + lock_page(hpage); + } + + if (PageAnon(hpage)) { + rcu_read_lock(); + rcu_locked = 1; + + if (page_mapped(hpage)) { + anon_vma = page_anon_vma(hpage); + atomic_inc(&anon_vma->external_refcount); + } + } + + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1); + + if (rc) + remove_migration_ptes(hpage, hpage); + + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, + &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + + if (rcu_locked) + rcu_read_unlock(); +out: + unlock_page(hpage); + + if (rc != -EAGAIN) { + list_del(&hpage->lru); + put_page(hpage); + } + + put_page(new_hpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(new_hpage); + } + return rc; +} + +/* * migrate_pages * * The function takes one list of pages to migrate and a function @@ -736,8 +883,9 @@ move_newpage: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list. * * Return: Number of pages not migrated or error code. 
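/*
 * Caller-side note (illustrative, not part of the patch hunks themselves):
 * since migrate_pages() no longer puts failed pages back on the LRU itself,
 * callers now follow the pattern already visible in the mm/mempolicy.c and
 * do_move_page_to_node_array() hunks of this diff:
 *
 *	if (!list_empty(&pagelist)) {
 *		err = migrate_pages(&pagelist, new_node_page, dest, 0);
 *		if (err)
 *			putback_lru_pages(&pagelist);
 *	}
 */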
*/ @@ -784,7 +932,51 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); + if (rc) + return rc; + + return nr_failed + retry; +} + +int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, int offlining) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int rc; + + for (pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + + list_for_each_entry_safe(page, page2, from, lru) + put_page(page); if (rc) return rc; @@ -845,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); @@ -894,9 +1086,12 @@ set_status: } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; @@ -1009,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); diff --git a/mm/mlock.c b/mm/mlock.c index 3f82720e0515..b70919ce4f72 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page) } } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. 
* @vma: target vma @@ -167,6 +174,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= FOLL_WRITE; + /* We don't try to access the guard page of a stack vma */ + if (stack_guard_page(vma, start)) { + addr += PAGE_SIZE; + nr_pages--; + } + while (nr_pages > 0) { int i; diff --git a/mm/mmap.c b/mm/mmap.c index 456ec6f27889..b179abb1474a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@ #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/perf_event.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -388,17 +389,23 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + struct vm_area_struct *next; + + vma->vm_prev = prev; if (prev) { - vma->vm_next = prev->vm_next; + next = prev->vm_next; prev->vm_next = vma; } else { mm->mmap = vma; if (rb_parent) - vma->vm_next = rb_entry(rb_parent, + next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); else - vma->vm_next = NULL; + next = NULL; } + vma->vm_next = next; + if (next) + next->vm_prev = vma; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -452,12 +459,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mapping->i_mmap_lock); vma->vm_truncate_count = mapping->truncate_count; } - anon_vma_lock(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); - anon_vma_unlock(vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -485,7 +490,11 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - prev->vm_next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next; + + prev->vm_next = next; + if (next) + next->vm_prev = prev; rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; @@ -506,6 +515,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct prio_tree_root *root = NULL; + struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; int remove_next = 0; @@ -578,6 +588,17 @@ again: remove_next = 1 + (end > next->vm_end); } } + /* + * When changing only vma->vm_end, we don't really need anon_vma + * lock. This is a fairly rare case by itself, but the anon_vma + * lock may be shared between many sibling processes. Skipping + * the lock for brk adjustments makes a difference sometimes. + */ + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } + if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -617,6 +638,8 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -1086,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long retval = -EBADF; if (!(flags & MAP_ANONYMOUS)) { + audit_mmap_fd(fd, flags); if (unlikely(flags & MAP_HUGETLB)) return -EINVAL; file = fget(fd); @@ -1694,9 +1718,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. 
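/*
 * Illustrative sketch (not from this patch): the __vma_link_list() and
 * __vma_unlink() hunks above add a vm_prev back-pointer so the VMA list is
 * doubly linked. A minimal, runnable userspace model of that insertion
 * bookkeeping, with made-up struct and field names, looks like this:
 */
#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;
	struct node *prev;
};

/* Insert @n after @prev; @prev == NULL means insert at the head @*list. */
static void link_after(struct node **list, struct node *prev, struct node *n)
{
	struct node *next;

	n->prev = prev;
	if (prev) {
		next = prev->next;
		prev->next = n;
	} else {
		next = *list;
		*list = n;
	}
	n->next = next;
	if (next)
		next->prev = n;
}

int main(void)
{
	struct node a = { NULL, NULL }, b = { NULL, NULL };
	struct node *head = NULL;

	link_after(&head, NULL, &a);	/* list: a */
	link_after(&head, &a, &b);	/* list: a <-> b */
	assert(head == &a && a.next == &b && b.prev == &a && !b.next);
	return 0;
}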
*/ -#ifndef CONFIG_IA64 -static -#endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1710,7 +1731,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1721,7 +1742,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (address < PAGE_ALIGN(address+4)) address = PAGE_ALIGN(address+4); else { - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return -ENOMEM; } error = 0; @@ -1734,10 +1755,12 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; error = acct_stack_growth(vma, size, grow); - if (!error) + if (!error) { vma->vm_end = address; + perf_event_mmap(vma); + } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1762,7 +1785,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (error) return error; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1781,9 +1804,10 @@ static int expand_downwards(struct vm_area_struct *vma, if (!error) { vma->vm_start = address; vma->vm_pgoff -= grow; + perf_event_mmap(vma); } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } @@ -1900,6 +1924,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); + vma->vm_prev = NULL; do { rb_erase(&vma->vm_rb, &mm->mm_rb); mm->map_count--; @@ -1907,6 +1932,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; + if (vma) + vma->vm_prev = prev; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; @@ -1984,6 +2011,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, removed_exe_file_vma(mm); fput(new->vm_file); } + unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); out_free_vma: @@ -2208,6 +2236,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) @@ -2466,23 +2495,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->lock. If some other vma in this mm shares + * anon_vma->root->lock. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->lock. + * anon_vma->root->lock. 
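/*
 * Illustrative userspace model (not from this patch) of the trick used by
 * vm_lock_anon_vma() above: while an outer mutex (mm_all_locks_mutex in the
 * kernel) is held, bit 0 of the list-head pointer doubles as an "already
 * locked" marker, so a root anon_vma shared by several vmas is only locked
 * once per mm_take_all_locks(). Types and names here are made up.
 */
#include <assert.h>
#include <stdint.h>

struct fake_anon_vma {
	uintptr_t head_next;	/* low bit doubles as the "locked" marker */
};

static int mark_once(struct fake_anon_vma *av)
{
	if (av->head_next & 1)
		return 0;		/* already marked via an earlier vma */
	av->head_next |= 1;
	return 1;
}

int main(void)
{
	struct fake_anon_vma av = { .head_next = 0 };

	assert(mark_once(&av) == 1);
	assert(mark_once(&av) == 0);	/* second vma sharing the root: skip */
	av.head_next &= ~(uintptr_t)1;	/* vm_unlock_anon_vma clears it */
	return 0;
}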
*/ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); } } @@ -2573,7 +2602,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -2584,12 +2613,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->lock. + * anon_vma->root->lock. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } } diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..e35bfb82c855 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +#ifdef CONFIG_SMP +/* Called when a more accurate view of NR_FREE_PAGES is needed */ +unsigned long zone_nr_free_pages(struct zone *zone) +{ + unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); + + /* + * While kswapd is awake, it is considered the zone is under some + * memory pressure. Under pressure, there is a risk that + * per-cpu-counter-drift will allow the min watermark to be breached + * potentially causing a live-lock. While kswapd is awake and + * free pages are low, get a better estimate for free pages + */ + if (nr_free_pages < zone->percpu_drift_mark && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return zone_page_state_snapshot(zone, NR_FREE_PAGES); + + return nr_free_pages; +} +#endif /* CONFIG_SMP */ diff --git a/mm/mremap.c b/mm/mremap.c index cde56ee51ef7..563fbdd6293a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); diff --git a/mm/nommu.c b/mm/nommu.c index b76f3ee0abe0..3613517c7592 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -29,6 +29,7 @@ #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -36,11 +37,6 @@ #include <asm/mmu_context.h> #include "internal.h" -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} - #if 0 #define kenter(FMT, ...) \ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) @@ -298,11 +294,58 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. 
+ * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } -EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL @@ -609,7 +652,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, **pp; + struct vm_area_struct *pvma, **pp, *next; struct address_space *mapping; struct rb_node **p, *parent; @@ -669,8 +712,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) break; } - vma->vm_next = *pp; + next = *pp; *pp = vma; + vma->vm_next = next; + if (next) + next->vm_prev = vma; } /* @@ -1413,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct file *file = NULL; unsigned long retval = -EBADF; + audit_mmap_fd(fd, flags); if (!(flags & MAP_ANONYMOUS)) { file = fget(fd); if (!file) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 709aedfaa014..7dcca55ede7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -4,6 +4,8 @@ * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes * * The routines in this file are used to kill a process when * we're seriously out of memory. This gets called from __alloc_pages() @@ -27,171 +29,194 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks; +int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* #define DEBUG */ + +#ifdef CONFIG_NUMA +/** + * has_intersects_mems_allowed() - check task eligiblity for kill + * @tsk: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. 
+ */ +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct task_struct *start = tsk; + + do { + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + if (mempolicy_nodemask_intersects(tsk, mask)) + return true; + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + if (cpuset_mems_allowed_intersects(current, tsk)) + return true; + } + } while_each_thread(start, tsk); + + return false; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + return true; +} +#endif /* CONFIG_NUMA */ /* - * Is all threads of the target process nodes overlap ours? + * If this is a system OOM (not a memcg OOM) and the task selected to be + * killed is not already running at high (RT) priorities, speed up the + * recovery by boosting the dying task to the lowest FIFO priority. + * That helps with the recovery and avoids interfering with RT tasks. */ -static int has_intersects_mems_allowed(struct task_struct *tsk) +static void boost_dying_task_prio(struct task_struct *p, + struct mem_cgroup *mem) { - struct task_struct *t; + struct sched_param param = { .sched_priority = 1 }; + + if (mem) + return; + + if (!rt_task(p)) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); +} + +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ +struct task_struct *find_lock_task_mm(struct task_struct *p) +{ + struct task_struct *t = p; - t = tsk; do { - if (cpuset_mems_allowed_intersects(current, t)) - return 1; - t = next_thread(t); - } while (t != tsk); + task_lock(t); + if (likely(t->mm)) + return t; + task_unlock(t); + } while_each_thread(p, t); - return 0; + return NULL; +} + +/* return true if the task is not adequate as candidate victim task. */ +static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) +{ + if (is_global_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + + /* When mem_cgroup_out_of_memory() and p is not member of the group */ + if (mem && !task_in_mem_cgroup(p, mem)) + return true; + + /* p may not have freeable memory in nodemask */ + if (!has_intersects_mems_allowed(p, nodemask)) + return true; + + return false; } /** - * badness - calculate a numeric value for how bad this task has been + * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate - * @uptime: current uptime in seconds - * - * The formula used is relatively simple and documented inline in the - * function. The main rationale is that we want to select a good task - * to kill when we run out of memory. + * @totalpages: total present RAM allowed for page allocation * - * Good in this context means that: - * 1) we lose the minimum amount of work done - * 2) we recover a large amount of memory - * 3) we don't kill anything innocent of eating tons of memory - * 4) we want to kill the minimum amount of processes (one) - * 5) we try to kill the process the user expects us to kill, this - * algorithm has been meticulously tuned to meet the principle - * of least surprise ... 
(be careful when you change it) + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. */ - -unsigned long badness(struct task_struct *p, unsigned long uptime) +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points, cpu_time, run_time; - struct mm_struct *mm; - struct task_struct *child; - int oom_adj = p->signal->oom_adj; - struct task_cputime task_time; - unsigned long utime; - unsigned long stime; + int points; - if (oom_adj == OOM_DISABLE) + if (oom_unkillable_task(p, mem, nodemask)) return 0; - task_lock(p); - mm = p->mm; - if (!mm) { - task_unlock(p); + p = find_lock_task_mm(p); + if (!p) return 0; - } - - /* - * The memory size of the process is the basis for the badness. - */ - points = mm->total_vm; /* - * After this unlock we can no longer dereference local variable `mm' + * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - task_unlock(p); - - /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_OOM_ORIGIN) - return ULONG_MAX; - - /* - * Processes which fork a lot of child processes are likely - * a good choice. We add half the vmsize of the children if they - * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children. In case a single - * child is eating the vast majority of memory, adding only half - * to the parents will make the child our kill candidate of choice. - */ - list_for_each_entry(child, &p->children, sibling) { - task_lock(child); - if (child->mm != mm && child->mm) - points += child->mm->total_vm/2 + 1; - task_unlock(child); + if (atomic_read(&p->mm->oom_disable_count)) { + task_unlock(p); + return 0; } /* - * CPU time is in tens of seconds and run time is in thousands - * of seconds. There is no particular reason for this other than - * that it turned out to work very well in practice. + * When the PF_OOM_ORIGIN bit is set, it indicates the task should have + * priority for oom killing. */ - thread_group_cputime(p, &task_time); - utime = cputime_to_jiffies(task_time.utime); - stime = cputime_to_jiffies(task_time.stime); - cpu_time = (utime + stime) >> (SHIFT_HZ + 3); - - - if (uptime >= p->start_time.tv_sec) - run_time = (uptime - p->start_time.tv_sec) >> 10; - else - run_time = 0; - - if (cpu_time) - points /= int_sqrt(cpu_time); - if (run_time) - points /= int_sqrt(int_sqrt(run_time)); + if (p->flags & PF_OOM_ORIGIN) { + task_unlock(p); + return 1000; + } /* - * Niced processes are most likely less important, so double - * their badness points. + * The memory controller may have a limit of 0 bytes, so avoid a divide + * by zero, if necessary. */ - if (task_nice(p) > 0) - points *= 2; + if (!totalpages) + totalpages = 1; /* - * Superuser processes are usually more important, so we make it - * less likely that we kill those. + * The baseline for the badness score is the proportion of RAM that each + * task's rss and swap space use. 
*/ - if (has_capability_noaudit(p, CAP_SYS_ADMIN) || - has_capability_noaudit(p, CAP_SYS_RESOURCE)) - points /= 4; + points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / + totalpages; + task_unlock(p); /* - * We don't want to kill a process with direct hardware access. - * Not only could that mess up the hardware, but usually users - * tend to only have this flag set on applications they think - * of as important. + * Root processes get 3% bonus, just like the __vm_enough_memory() + * implementation used by LSMs. */ - if (has_capability_noaudit(p, CAP_SYS_RAWIO)) - points /= 4; + if (has_capability_noaudit(p, CAP_SYS_ADMIN)) + points -= 30; /* - * If p's nodes don't overlap ours, it may still help to kill p - * because p may have allocated or otherwise mapped memory on - * this node before. However it will be less likely. + * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may + * either completely disable oom killing or always prefer a certain + * task. */ - if (!has_intersects_mems_allowed(p)) - points /= 8; + points += p->signal->oom_score_adj; /* - * Adjust the score by oom_adj. + * Never return 0 for an eligible task that may be killed since it's + * possible that no single user task uses more than 0.1% of memory and + * no single admin tasks uses more than 3.0%. */ - if (oom_adj) { - if (oom_adj > 0) { - if (!points) - points = 1; - points <<= oom_adj; - } else - points >>= -(oom_adj); - } - -#ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", - p->pid, p->comm, points); -#endif - return points; + if (points <= 0) + return 1; + return (points < 1000) ? points : 1000; } /* @@ -199,12 +224,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) */ #ifdef CONFIG_NUMA static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); + bool cpuset_limited = false; + int nid; + /* Default to all available memory */ + *totalpages = totalram_pages + total_swap_pages; + + if (!zonelist) + return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. @@ -214,26 +247,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; /* - * The nodemask here is a nodemask passed to alloc_pages(). Now, - * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy - * feature. mempolicy is an only user of nodemask here. - * check mempolicy's nodemask contains all N_HIGH_MEMORY + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). 
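/*
 * Illustrative userspace model (not from this patch) of the new heuristic:
 * the score is the task's rss+swap as a proportion of the allowed memory,
 * scaled to 0..1000, minus a 3% bonus for root, shifted by oom_score_adj
 * and clamped so an eligible task never scores 0. The OOM_SCORE_ADJ_MIN
 * "disable" case is short-circuited earlier in the real code.
 */
#include <stdio.h>

static int oom_badness_model(unsigned long rss, unsigned long swapents,
			     unsigned long totalpages, int is_root,
			     int oom_score_adj)
{
	int points;

	if (!totalpages)
		totalpages = 1;		/* avoid divide by zero */
	points = (rss + swapents) * 1000 / totalpages;
	if (is_root)
		points -= 30;		/* 3% bonus, like __vm_enough_memory() */
	points += oom_score_adj;	/* -1000..+1000 policy shift */
	if (points <= 0)
		return 1;
	return points < 1000 ? points : 1000;
}

int main(void)
{
	/* task using 10% of RAM+swap, unprivileged, default oom_score_adj */
	printf("%d\n", oom_badness_model(100, 0, 1000, 0, 0));	/* prints 100 */
	return 0;
}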
*/ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, *nodemask) + *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; + } /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) - return CONSTRAINT_CPUSET; + cpuset_limited = true; + if (cpuset_limited) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + *totalpages += node_spanned_pages(nid); + return CONSTRAINT_CPUSET; + } return CONSTRAINT_NONE; } #else static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { + *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif @@ -244,28 +288,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem) +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, struct mem_cgroup *mem, + const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *chosen = NULL; - struct timespec uptime; *ppoints = 0; - do_posix_clock_monotonic_gettime(&uptime); for_each_process(p) { - unsigned long points; + unsigned int points; - /* - * skip kernel threads and tasks which have already released - * their mm. - */ - if (!p->mm) - continue; - /* skip the init task */ - if (is_global_init(p)) - continue; - if (mem && !task_in_mem_cgroup(p, mem)) + if (oom_unkillable_task(p, mem, nodemask)) continue; /* @@ -290,19 +324,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if (p->flags & PF_EXITING) { + if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { if (p != current) return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + *ppoints = 1000; } - if (p->signal->oom_adj == OOM_DISABLE) - continue; - - points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + points = oom_badness(p, mem, nodemask, totalpages); + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -313,176 +344,208 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, /** * dump_tasks - dump current memory state of all system tasks - * @mem: target memory controller + * @mem: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * - * Dumps the current memory state of all system tasks, excluding kernel threads. + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * score, and name. - * - * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are - * shown. + * value, oom_score_adj value, and name. * * Call with tasklist_lock read-locked. 
*/ -static void dump_tasks(const struct mem_cgroup *mem) +static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *g, *p; - - printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " - "name\n"); - do_each_thread(g, p) { - struct mm_struct *mm; + struct task_struct *p; + struct task_struct *task; - if (mem && !task_in_mem_cgroup(p, mem)) - continue; - if (!thread_group_leader(p)) + pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); + for_each_process(p) { + if (oom_unkillable_task(p, mem, nodemask)) continue; - task_lock(p); - mm = p->mm; - if (!mm) { + task = find_lock_task_mm(p); + if (!task) { /* - * total_vm and rss sizes do not exist for tasks with no - * mm so there's no need to report them; they can't be - * oom killed anyway. + * This is a kthread or all of p's threads have already + * detached their mm's. There's no need to report + * them; they can't be oom killed anyway. */ - task_unlock(p); continue; } - printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, - p->comm); - task_unlock(p); - } while_each_thread(g, p); + + pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + task->pid, task_uid(task), task->tgid, + task->mm->total_vm, get_mm_rss(task->mm), + task_cpu(task), task->signal->oom_adj, + task->signal->oom_score_adj, task->comm); + task_unlock(task); + } } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem) + struct mem_cgroup *mem, const nodemask_t *nodemask) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj); task_lock(current); + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " + "oom_adj=%d, oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj, + current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); show_mem(); if (sysctl_oom_dump_tasks) - dump_tasks(mem); + dump_tasks(mem, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) - -/* - * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO - * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO - * set. 
- */ -static void __oom_kill_task(struct task_struct *p, int verbose) +static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { - if (is_global_init(p)) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill init!\n"); - return; - } + struct task_struct *q; + struct mm_struct *mm; - task_lock(p); - if (!p->mm) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", - task_pid_nr(p), p->comm); - task_unlock(p); - return; - } + p = find_lock_task_mm(p); + if (!p) + return 1; + + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; - if (verbose) - printk(KERN_ERR "Killed process %d (%s) " - "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", - task_pid_nr(p), p->comm, - K(p->mm->total_vm), - K(get_mm_counter(p->mm, MM_ANONPAGES)), - K(get_mm_counter(p->mm, MM_FILEPAGES))); + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(p), p->comm, K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. */ - p->rt.time_slice = HZ; - set_tsk_thread_flag(p, TIF_MEMDIE); + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); + force_sig(SIGKILL, q); + } + set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); -} -static int oom_kill_task(struct task_struct *p) -{ - /* WARNING: mm may not be dereferenced since we did not obtain its - * value from get_task_mm(p). This is OK since all we need to do is - * compare mm to q->mm below. - * - * Furthermore, even if mm contains a non-NULL value, p->mm may - * change to NULL at any time since we do not hold task_lock(p). - * However, this is of no concern to us. + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... 
*/ - if (!p->mm || p->signal->oom_adj == OOM_DISABLE) - return 1; - - __oom_kill_task(p, 1); + boost_dying_task_prio(p, mem); return 0; } +#undef K static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *mem, nodemask_t *nodemask, const char *message) { - struct task_struct *c; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t = p; + unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem); + dump_header(p, gfp_mask, order, mem, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ if (p->flags & PF_EXITING) { - __oom_kill_task(p, 0); + set_tsk_thread_flag(p, TIF_MEMDIE); + boost_dying_task_prio(p, mem); return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + task_lock(p); + pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + task_unlock(p); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + do { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; - /* Try to kill a child first */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == p->mm) - continue; - if (mem && !task_in_mem_cgroup(c, mem)) - continue; - if (!oom_kill_task(c)) - return 0; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, mem, nodemask, + totalpages); + if (child_points > victim_points) { + victim = child; + victim_points = child_points; + } + } + } while_each_thread(p, t); + + return oom_kill_task(victim, mem); +} + +/* + * Determines whether the kernel must panic because of the panic_on_oom sysctl. + */ +static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask) +{ + if (likely(!sysctl_panic_on_oom)) + return; + if (sysctl_panic_on_oom != 2) { + /* + * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel + * does not panic for cpuset, mempolicy, or memcg allocation + * failures. + */ + if (constraint != CONSTRAINT_NONE) + return; } - return oom_kill_task(p); + read_lock(&tasklist_lock); + dump_header(NULL, gfp_mask, order, NULL, nodemask); + read_unlock(&tasklist_lock); + panic("Out of memory: %s panic_on_oom is enabled\n", + sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { - unsigned long points = 0; + unsigned long limit; + unsigned int points = 0; struct task_struct *p; - if (sysctl_panic_on_oom == 2) - panic("out of memory(memcg). 
panic_on_oom is selected.\n"); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); + limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem); + p = select_bad_process(&points, limit, mem, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, mem, + if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, "Memory cgroup out of memory")) goto retry; out: @@ -509,7 +572,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; @@ -526,7 +589,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zone_oom() doesn't succeed + * parallel invocation of try_set_zonelist_oom() doesn't succeed * when it shouldn't. */ zone_set_flag(zone, ZONE_OOM_LOCKED); @@ -555,65 +618,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) } /* - * Must be called with tasklist_lock held for read. + * Try to acquire the oom killer lock for all system zones. Returns zero if a + * parallel oom killing is taking place, otherwise locks all zones and returns + * non-zero. */ -static void __out_of_memory(gfp_t gfp_mask, int order) +static int try_set_system_oom(void) { - struct task_struct *p; - unsigned long points; - - if (sysctl_oom_kill_allocating_task) - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)")) - return; -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - return; - - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - dump_header(NULL, gfp_mask, order, NULL); - panic("Out of memory and no killable processes...\n"); - } + struct zone *zone; + int ret = 1; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + if (zone_is_oom_locked(zone)) { + ret = 0; + goto out; + } + for_each_populated_zone(zone) + zone_set_flag(zone, ZONE_OOM_LOCKED); +out: + spin_unlock(&zone_scan_lock); + return ret; } /* - * pagefault handler calls into here because it is out of memory but - * doesn't know exactly how or why. + * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation + * attempts or page faults may now recall the oom killer, if necessary. */ -void pagefault_out_of_memory(void) +static void clear_system_oom(void) { - unsigned long freed = 0; - - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return; - - if (sysctl_panic_on_oom) - panic("out of memory from page fault. panic_on_oom is selected.\n"); - - read_lock(&tasklist_lock); - __out_of_memory(0, 0); /* unknown gfp_mask and order */ - read_unlock(&tasklist_lock); + struct zone *zone; - /* - * Give "p" a good chance of killing itself before we - * retry to allocate memory. 
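/*
 * Illustrative sketch (not from this patch): the try_set_system_oom() /
 * clear_system_oom() helpers added here give pagefault_out_of_memory() an
 * all-or-nothing trylock over every populated zone. A runnable userspace
 * model of that pattern, with a flag array standing in for ZONE_OOM_LOCKED
 * and a mutex standing in for zone_scan_lock (build with -lpthread):
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

static pthread_mutex_t scan_lock = PTHREAD_MUTEX_INITIALIZER;
static bool zone_oom_locked[NR_ZONES];

/* Take the "oom lock" on all zones, or none if any zone is already locked. */
static bool try_set_system_oom_model(void)
{
	bool ret = true;
	int i;

	pthread_mutex_lock(&scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		if (zone_oom_locked[i]) {
			ret = false;
			goto out;
		}
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = true;
out:
	pthread_mutex_unlock(&scan_lock);
	return ret;
}

static void clear_system_oom_model(void)
{
	int i;

	pthread_mutex_lock(&scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = false;
	pthread_mutex_unlock(&scan_lock);
}

int main(void)
{
	if (try_set_system_oom_model()) {
		printf("would call out_of_memory(NULL, 0, 0, NULL)\n");
		clear_system_oom_model();
	}
	return 0;
}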
- */ - if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + zone_clear_flag(zone, ZONE_OOM_LOCKED); + spin_unlock(&zone_scan_lock); } /** @@ -621,6 +659,7 @@ void pagefault_out_of_memory(void) * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -630,49 +669,93 @@ void pagefault_out_of_memory(void) void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { + const nodemask_t *mpol_mask; + struct task_struct *p; + unsigned long totalpages; unsigned long freed = 0; - enum oom_constraint constraint; + unsigned int points; + enum oom_constraint constraint = CONSTRAINT_NONE; + int killed = 0; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom == 2) { - dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. Compulsory panic_on_oom is selected.\n"); + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + boost_dying_task_prio(current, NULL); + return; } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask, nodemask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask, + &totalpages); + mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + read_lock(&tasklist_lock); + if (sysctl_oom_kill_allocating_task && + !oom_unkillable_task(current, NULL, nodemask) && + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + /* + * oom_kill_process() needs tasklist_lock held. If it returns + * non-zero, current could not be killed so we must fallback to + * the tasklist scan. + */ + if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, + NULL, nodemask, + "Out of memory (oom_kill_allocating_task)")) + goto out; + } - switch (constraint) { - case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, - "No available memory (MPOL_BIND)"); - break; +retry: + p = select_bad_process(&points, totalpages, NULL, mpol_mask); + if (PTR_ERR(p) == -1UL) + goto out; - case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) { - dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. panic_on_oom is selected\n"); - } - /* Fall-through */ - case CONSTRAINT_CPUSET: - __out_of_memory(gfp_mask, order); - break; + /* Found nothing?!?! Either we hang forever, or we panic. 
*/ + if (!p) { + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory")) + goto retry; + killed = 1; +out: read_unlock(&tasklist_lock); /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current */ + if (killed && !test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); +} + +/* + * The pagefault handler calls here because it is out of memory, so kill a + * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel + * oom killing is already in progress so do nothing. If a task is found with + * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + */ +void pagefault_out_of_memory(void) +{ + if (try_set_system_oom()) { + out_of_memory(NULL, 0, 0, NULL); + clear_system_oom(); + } if (!test_thread_flag(TIF_MEMDIE)) schedule_timeout_uninterruptible(1); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef61548..b840afa89761 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> #include <linux/pagevec.h> +#include <trace/events/writeback.h> /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, } } -/* - * Clip the earned share of dirty pages to that which is actually available. - * This avoids exceeding the total dirty_limit when the floating averages - * fluctuate too quickly. - */ -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty, unsigned long *pbdi_dirty) -{ - unsigned long avail_dirty; - - avail_dirty = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_WRITEBACK) + - global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP); - - if (avail_dirty < dirty) - avail_dirty = dirty - avail_dirty; - else - avail_dirty = 0; - - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + - bdi_stat(bdi, BDI_WRITEBACK); - - *pbdi_dirty = min(*pbdi_dirty, avail_dirty); -} - static inline void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { @@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk, } /* - * scale the dirty limit + * task_dirty_limit - scale down dirty throttling threshold for one task * * task specific dirty limit: * * dirty -= (dirty/8) * p_{t} + * + * To protect light/slow dirtying tasks from heavier/fast ones, we start + * throttling individual tasks before reaching the bdi dirty limit. + * Relatively low thresholds will be allocated to heavy dirtiers. So when + * dirty pages grow large, heavy dirtiers will be throttled first, which will + * effectively curb the growth of dirty pages. Light dirtiers with high enough + * dirty threshold may never get throttled. 
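/*
 * Illustrative userspace model (not from this patch) of the per-task scaling
 * described above: a task responsible for the fraction num/den of recent
 * dirtying has its threshold lowered by (dirty/8) * num/den, but never below
 * half of the bdi limit. num/den stand for task_dirties_fraction() output.
 */
#include <stdio.h>

static unsigned long task_dirty_limit_model(unsigned long bdi_dirty,
					    long num, long den)
{
	unsigned long dirty = bdi_dirty;
	unsigned long long inv = bdi_dirty >> 3;

	inv = inv * num / den;
	dirty -= inv;

	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
	/* a heavy dirtier owning ~3/4 of recent dirtying: 8000 -> 7250 */
	printf("%lu\n", task_dirty_limit_model(8000, 3, 4));
	return 0;
}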
*/ -static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) +static unsigned long task_dirty_limit(struct task_struct *tsk, + unsigned long bdi_dirty) { long numerator, denominator; - unsigned long dirty = *pdirty; + unsigned long dirty = bdi_dirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) do_div(inv, denominator); dirty -= inv; - if (dirty < *pdirty/2) - dirty = *pdirty/2; - *pdirty = dirty; + return max(dirty, bdi_dirty/2); } /* @@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -void -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi) +/* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * runtime tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; @@ -427,14 +415,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -450,27 +432,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, } *pbackground = background; *pdirty = dirty; +} - if (bdi) { - u64 bdi_dirty; - long numerator, denominator; +/* + * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * + * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * - starving fast devices + * - piling up dirty pages (that will take long time to sync) on slow devices + * + * The bdi's share of dirty limit will be adapting to its throughput and + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + */ +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +{ + u64 bdi_dirty; + long numerator, denominator; - /* - * Calculate this BDI's share of the dirty ratio. - */ - bdi_writeout_fraction(bdi, &numerator, &denominator); - - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; - - *pbdi_dirty = bdi_dirty; - clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); - task_dirty_limit(current, pbdi_dirty); - } + /* + * Calculate this BDI's share of the dirty ratio. 
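/*
 * Illustrative model (not from this patch) of the per-BDI share computed
 * here: the global limit is split by the device's recent writeout fraction
 * (num/den) after reserving the sum of all devices' minimums
 * (reserved_min_ratio models the global bdi_min_ratio), then bounded by the
 * device's own min_ratio/max_ratio percentages.
 */
#include <stdio.h>

static unsigned long bdi_dirty_limit_model(unsigned long dirty,
					   long num, long den,
					   unsigned int reserved_min_ratio,
					   unsigned int min_ratio,
					   unsigned int max_ratio)
{
	unsigned long long bdi_dirty;

	bdi_dirty = (unsigned long long)dirty * (100 - reserved_min_ratio) / 100;
	bdi_dirty = bdi_dirty * num / den;
	bdi_dirty += (unsigned long long)dirty * min_ratio / 100;

	if (bdi_dirty > (unsigned long long)dirty * max_ratio / 100)
		bdi_dirty = (unsigned long long)dirty * max_ratio / 100;

	return (unsigned long)bdi_dirty;
}

int main(void)
{
	/* device doing half the writeout, no reserved minimum, capped at 100% */
	printf("%lu\n", bdi_dirty_limit_model(1000, 1, 2, 0, 0, 100));
	return 0;
}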
+ */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + return bdi_dirty; } /* @@ -490,7 +482,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long pause = 1; - + bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { @@ -501,46 +493,23 @@ static void balance_dirty_pages(struct address_space *mapping, .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); - - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) - break; + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. - */ - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - } + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -555,16 +524,45 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_thresh < 2*bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); - } else if (bdi_nr_reclaimable) { + } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + /* + * The bdi thresh is somehow "soft" limit derived from the + * global "hard" limit. The former helps to prevent heavy IO + * bdi or process from holding back light ones; The latter is + * the last resort safeguard. + */ + dirty_exceeded = + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); + + if (!dirty_exceeded) break; - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. 
+ * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. + */ + trace_wbc_balance_dirty_start(&wbc, bdi); + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wb(&bdi->wb, &wbc); + pages_written += write_chunk - wbc.nr_to_write; + trace_wbc_balance_dirty_written(&wbc, bdi); + if (pages_written >= write_chunk) + break; /* We've done our duty */ + } + trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_INTERRUPTIBLE); io_schedule_timeout(pause); @@ -577,8 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping, pause = HZ / 10; } - if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && - bdi->dirty_exceeded) + if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) @@ -593,9 +590,7 @@ static void balance_dirty_pages(struct address_space *mapping, * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) - > background_thresh))) + (!laptop_mode && (nr_reclaimable > background_thresh))) bdi_start_background_writeback(bdi); } @@ -659,7 +654,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) unsigned long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -805,6 +800,42 @@ void __init page_writeback_init(void) } /** + * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * @mapping: address space structure to write + * @start: starting page index + * @end: ending page index (inclusive) + * + * This function scans the page range from @start to @end (inclusive) and tags + * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is + * that write_cache_pages (or whoever calls this function) will then use + * TOWRITE tag to identify pages eligible for writeback. This mechanism is + * used to avoid livelocking of writeback by a process steadily creating new + * dirty pages in the file (thus it is important for this function to be quick + * so that it can tag pages faster than a dirtying process can create them). + */ +/* + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + */ +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ +#define WRITEBACK_TAG_BATCH 4096 + unsigned long tagged; + + do { + spin_lock_irq(&mapping->tree_lock); + tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, + &start, end, WRITEBACK_TAG_BATCH, + PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); + cond_resched(); + /* We check 'start' to handle wrapping when end == ~0UL */ + } while (tagged >= WRITEBACK_TAG_BATCH && start); +} +EXPORT_SYMBOL(tag_pages_for_writeback); + +/** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write @@ -818,6 +849,13 @@ void __init page_writeback_init(void) * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. 
+ * + * To avoid livelocks (when other process dirties new pages), we first tag + * pages which should be written back with TOWRITE tag and only then start + * writing them. For data-integrity sync we have to be careful so that we do + * not miss some pages (e.g., because some other process has cleared TOWRITE + * tag we set). The rule we follow is that TOWRITE tag can be cleared only + * by the process clearing the DIRTY tag (and submitting the page for IO). */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -833,6 +871,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -849,29 +888,19 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ - - /* - * If this is a data integrity sync, cap the writeback to the - * current end of file. Any extension to the file that occurs - * after this is a new write and we don't need to write those - * pages out to fulfil our data integrity requirements. If we - * try to write them out, we can get stuck in this scan until - * the concurrent writer stops adding dirty pages and extending - * EOF. - */ - if (wbc->sync_mode == WB_SYNC_ALL && - wbc->range_end == LLONG_MAX) { - end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT; - } } - + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -929,6 +958,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -949,22 +979,16 @@ continue_unlock: } } - if (wbc->nr_to_write > 0) { - if (--wbc->nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; - } + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; } } pagevec_release(&pvec); @@ -1091,11 +1115,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } +EXPORT_SYMBOL(account_page_dirtied); + +/* + * Helper function for set_page_writeback family. 
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. + */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); +} +EXPORT_SYMBOL(account_page_writeback); /* * For address_spaces which do not use buffers. Just tag the page as dirty in @@ -1327,12 +1365,15 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9bd339eb04c6..07a654486f75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> @@ -530,7 +531,7 @@ static inline void __free_one_page(struct page *page, * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; combined_idx = __find_combined_index(page_idx, order); higher_page = page + combined_idx - page_idx; @@ -588,13 +589,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int to_free = count; spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count); - while (count) { + while (to_free) { struct page *page; struct list_head *list; @@ -619,8 +620,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, page_private(page)); trace_mm_page_pcpu_drain(page, 0, page_private(page)); - } while (--count && --batch_free && !list_empty(list)); + } while (--to_free && --batch_free && !list_empty(list)); } + __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -631,8 +633,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } @@ -1461,7 +1463,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -1738,7 +1740,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct page *page; /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zone_oom(zonelist, gfp_mask)) { + if (!try_set_zonelist_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -1759,6 +1761,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto 
out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (high_zoneidx < ZONE_NORMAL) + goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. @@ -1843,6 +1848,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; struct reclaim_state reclaim_state; struct task_struct *p = current; + bool drained = false; cond_resched(); @@ -1861,14 +1867,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); - if (order != 0) - drain_all_pages(); + if (unlikely(!(*did_some_progress))) + return NULL; - if (likely(*did_some_progress)) - page = get_page_from_freelist(gfp_mask, nodemask, order, +retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(); + drained = true; + goto retry; + } + return page; } @@ -1890,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1915,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1923,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; @@ -2052,15 +2069,23 @@ rebalance: if (page) goto got_pg; - /* - * The OOM killer does not trigger for high-order - * ~__GFP_NOFAIL allocations so if no progress is being - * made, there are no other options and retrying is - * unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; + if (!(gfp_mask & __GFP_NOFAIL)) { + /* + * The oom killer is not called for high-order + * allocations that may fail, so if no progress + * is being made, there are no other options and + * retrying is unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; + /* + * The oom killer is not called for lowmem + * allocations to prevent needlessly killing + * innocent tasks. + */ + if (high_zoneidx < ZONE_NORMAL) + goto nopage; + } goto restart; } @@ -2070,7 +2095,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -2412,7 +2437,7 @@ void show_free_areas(void) " all_unreclaimable? 
%s" "\n", zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone_nr_free_pages(zone)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -3612,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid, } } +#ifdef CONFIG_HAVE_MEMBLOCK +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + + /* Need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + u64 final_start, final_end; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + + final_start = max(ei_start, goal); + final_end = min(ei_last, limit); + + if (final_start >= final_end) + continue; + + addr = memblock_find_in_range(final_start, final_end, size, align); + + if (addr == MEMBLOCK_ERROR) + continue; + + return addr; + } + + return MEMBLOCK_ERROR; +} +#endif + int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { @@ -3631,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az, void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { - int i; void *ptr; + u64 addr; - if (limit > get_max_mapped()) - limit = get_max_mapped(); + if (limit > memblock.current_limit) + limit = memblock.current_limit; - /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - u64 addr; - u64 ei_start, ei_last; + addr = find_memory_core_early(nid, size, align, goal, limit); - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - addr = find_early_area(ei_start, ei_last, - goal, limit, size, align); - - if (addr == -1ULL) - continue; - -#if 0 - printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", - nid, - ei_start, ei_last, goal, limit, size, - align, addr); -#endif - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - reserve_early_without_check(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; - } + if (addr == MEMBLOCK_ERROR) + return NULL; - return NULL; + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; } #endif @@ -4089,8 +4129,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); @@ -5160,9 +5198,9 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, - (1U << log2qty), + (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); @@ -5259,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. 
*/ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - struct page *curr_page; - unsigned long flags, pfn, iter; - unsigned long immobile = 0; + unsigned long flags, pfn; struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; @@ -5274,11 +5365,6 @@ int set_migratetype_isolate(struct page *page) zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || - zone_idx == ZONE_MOVABLE) { - ret = 0; - goto out; - } pfn = page_to_pfn(page); arg.start_pfn = pfn; @@ -5298,23 +5384,20 @@ int set_migratetype_isolate(struct page *page) */ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret || !arg.pages_found) + if (notifier_ret) goto out; - - for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { - if (!pfn_valid_within(pfn)) - continue; - - curr_page = pfn_to_page(iter); - if (!page_count(curr_page) || PageLRU(curr_page)) - continue; - - immobile++; - } - - if (arg.pages_found == immobile) + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) ret = 0; + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. 
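
The is_pageblock_removable_nolock() helper introduced above is aimed at memory-hotplug style callers that need to judge whether a span of memory looks offlineable. A hedged sketch of such a caller follows; the function name and loop are invented for illustration, only the symbols from this patch (and the usual mm headers) are assumed:

static bool range_looks_removable(struct page *start, unsigned long nr_pages)
{
        struct page *page = start;
        struct page *end = start + nr_pages;

        /* one verdict per pageblock; any immobile block spoils the range */
        for (; page < end; page += pageblock_nr_pages)
                if (!is_pageblock_removable_nolock(page))
                        return false;
        return true;
}
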
+ */ + out: if (!ret) { set_pageblock_migratetype(page, MIGRATE_ISOLATE); diff --git a/mm/page_io.c b/mm/page_io.c index 31a3b962230a..2dee975bf469 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd967452..4ae42bb40892 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. diff --git a/mm/percpu-km.c b/mm/percpu-km.c index df680855540a..89633fefc6a2 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -27,7 +27,7 @@ * chunk size is not aligned. percpu-km code will whine about it. */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #error "contiguous percpu allocation is incompatible with paged first chunk" #endif @@ -35,7 +35,11 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - /* noop */ + unsigned int cpu; + + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 6470e7710231..efe816856a9d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -31,7 +31,7 @@ * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous + * guaranteed to be equal to or larger than the maximum contiguous * area in the chunk. This helps the allocator not to iterate the * chunk maps unnecessarily. 
* @@ -76,6 +76,7 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ @@ -89,6 +90,11 @@ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif +#else /* CONFIG_SMP */ +/* on UP, it's always identity mapped */ +#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) +#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) +#endif /* CONFIG_SMP */ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ @@ -282,6 +288,9 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, */ static void *pcpu_mem_alloc(size_t size) { + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else { @@ -390,14 +399,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) goto out_unlock; old_size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, old_size); + old = chunk->map; - /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is - * one of the first chunks and still using static map. - */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - old = chunk->map; + memcpy(new, old, old_size); chunk->map_alloc = new_alloc; chunk->map = new; @@ -604,7 +608,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; - chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); if (!chunk) return NULL; @@ -822,8 +826,8 @@ fail_unlock_mutex: * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align. + * Might sleep. Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -842,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align from reserved - * percpu area if arch has set it up; otherwise, allocation is served - * from the same dynamic area. Might sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align + * from reserved percpu area if arch has set it up; otherwise, + * allocation is served from the same dynamic area. Might sleep. + * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -951,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu); */ bool is_kernel_percpu_address(unsigned long addr) { +#ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; @@ -961,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr) if ((void *)addr >= start && (void *)addr < start + static_size) return true; } +#endif + /* on UP, can't distinguish from other static vars, always false */ return false; } @@ -1013,20 +1021,6 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) return page_to_phys(pcpu_addr_to_page(addr)); } -static inline size_t pcpu_calc_fc_sizes(size_t static_size, - size_t reserved_size, - ssize_t *dyn_sizep) -{ - size_t size_sum; - - size_sum = PFN_ALIGN(static_size + reserved_size + - (*dyn_sizep >= 0 ? 
*dyn_sizep : 0)); - if (*dyn_sizep != 0) - *dyn_sizep = size_sum - static_size - reserved_size; - - return size_sum; -} - /** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups @@ -1083,157 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) } /** - * pcpu_build_alloc_info - build alloc_info considering distances between CPUs - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @atom_size: allocation atom size - * @cpu_distance_fn: callback to determine distance between cpus, optional - * - * This function determines grouping of units, their mappings to cpus - * and other parameters considering needed percpu size, allocation - * atom size and distances between CPUs. - * - * Groups are always mutliples of atom size and CPUs which are of - * LOCAL_DISTANCE both ways are grouped together and share space for - * units in the same group. The returned configuration is guaranteed - * to have CPUs on different nodes on different groups and >=75% usage - * of allocated virtual address space. - * - * RETURNS: - * On success, pointer to the new allocation_info is returned. On - * failure, ERR_PTR value is returned. - */ -struct pcpu_alloc_info * __init pcpu_build_alloc_info( - size_t reserved_size, ssize_t dyn_size, - size_t atom_size, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - static int group_map[NR_CPUS] __initdata; - static int group_cnt[NR_CPUS] __initdata; - const size_t static_size = __per_cpu_end - __per_cpu_start; - int nr_groups = 1, nr_units = 0; - size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs, group, unit; - unsigned int cpu, tcpu; - struct pcpu_alloc_info *ai; - unsigned int *cpu_map; - - /* this function may be called multiple times */ - memset(group_map, 0, sizeof(group_map)); - memset(group_cnt, 0, sizeof(group_cnt)); - - /* - * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to - * or larger than min_unit_size. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - - alloc_size = roundup(min_unit_size, atom_size); - upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - upa--; - max_upa = upa; - - /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && cpu_distance_fn && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - nr_groups = max(nr_groups, group + 1); - goto next_group; - } - } - group_map[cpu] = group; - group_cnt[group]++; - } - - /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. - */ - last_allocs = INT_MAX; - for (upa = max_upa; upa; upa--) { - int allocs = 0, wasted = 0; - - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - continue; - - for (group = 0; group < nr_groups; group++) { - int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); - allocs += this_allocs; - wasted += this_allocs * upa - group_cnt[group]; - } - - /* - * Don't accept if wastage is over 25%. 
The - * greater-than comparison ensures upa==1 always - * passes the following check. - */ - if (wasted > num_possible_cpus() / 3) - continue; - - /* and then don't consume more memory */ - if (allocs > last_allocs) - break; - last_allocs = allocs; - best_upa = upa; - } - upa = best_upa; - - /* allocate and fill alloc_info */ - for (group = 0; group < nr_groups; group++) - nr_units += roundup(group_cnt[group], upa); - - ai = pcpu_alloc_alloc_info(nr_groups, nr_units); - if (!ai) - return ERR_PTR(-ENOMEM); - cpu_map = ai->groups[0].cpu_map; - - for (group = 0; group < nr_groups; group++) { - ai->groups[group].cpu_map = cpu_map; - cpu_map += roundup(group_cnt[group], upa); - } - - ai->static_size = static_size; - ai->reserved_size = reserved_size; - ai->dyn_size = dyn_size; - ai->unit_size = alloc_size / upa; - ai->atom_size = atom_size; - ai->alloc_size = alloc_size; - - for (group = 0, unit = 0; group_cnt[group]; group++) { - struct pcpu_group_info *gi = &ai->groups[group]; - - /* - * Initialize base_offset as if all groups are located - * back-to-back. The caller should update this to - * reflect actual allocation. - */ - gi->base_offset = unit * ai->unit_size; - - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - gi->cpu_map[gi->nr_units++] = cpu; - gi->nr_units = roundup(gi->nr_units, upa); - unit += gi->nr_units; - } - BUG_ON(unit != nr_units); - - return ai; -} - -/** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump @@ -1350,7 +1193,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { static char cpus_buf[4096] __initdata; - static int smap[2], dmap[2]; + static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; + static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; @@ -1373,14 +1217,15 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } while (0) /* sanity checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || - ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); +#ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); +#endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1413,9 +1258,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; } } - pcpu_last_unit_cpu = cpu; pcpu_nr_units = unit; for_each_possible_cpu(cpu) @@ -1500,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, return 0; } +#ifdef CONFIG_SMP + const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", @@ -1527,12 +1374,184 @@ static int __init percpu_alloc_setup(char *str) } early_param("percpu_alloc", percpu_alloc_setup); +/* + * pcpu_embed_first_chunk() is used by the generic percpu setup. + * Build it if needed by the arch config or the generic setup is going + * to be used. 
+ */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) +#define BUILD_EMBED_FIRST_CHUNK +#endif + +/* build pcpu_page_first_chunk() iff needed by the arch config */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#define BUILD_PAGE_FIRST_CHUNK +#endif + +/* pcpu_build_alloc_info() is used by both embed and page first chunk */ +#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +static struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. 
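
To make the search that the comment above describes concrete, take an invented configuration: static + reserved + dynamic rounds up to 64KB, atom_size is 2MB, and the machine has 4 possible CPUs in a single group. Then alloc_size = 2MB and max_upa = 32 (a 2MB atom could hold thirty-two 64KB units). The loop below rejects upa = 32, 16 and 8 because the wasted units (28, 12 and 4) exceed num_possible_cpus()/3 = 1, accepts upa = 4 with zero waste, and breaks at upa = 2 since that would double the number of 2MB allocations. The chunk therefore ends up with 4 units of 512KB each.
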
+ */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 1/3. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} +#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ + +#if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page @@ -1553,10 +1572,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * - * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. When @dyn_size is auto, - * @dyn_size is just big enough to fill page alignment after static - * and reserved areas. + * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using @free_fn. @@ -1564,7 +1580,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * RETURNS: * 0 on success, -errno on failure. 
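
Note the interface change in the hunk below: @dyn_size goes from a signed value where -1 meant "auto" to an unsigned minimum. A hypothetical caller that relied on the old -1 convention would now pass an explicit byte count; the callback names here are invented, the constant is the existing PERCPU_DYNAMIC_RESERVE:

        /* before: -1 == auto-size the dynamic area after page rounding */
        rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, -1, PAGE_SIZE,
                                    NULL, my_alloc_fn, my_free_fn);

        /* after: pass the minimum dynamic size; more may be granted */
        rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
                                    NULL, my_alloc_fn, my_free_fn);
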
*/ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, +int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_alloc_fn_t alloc_fn, @@ -1660,10 +1676,9 @@ out_free: free_bootmem(__pa(areas), areas_size); return rc; } -#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || - !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* BUILD_EMBED_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#ifdef BUILD_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes @@ -1695,7 +1710,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); @@ -1771,10 +1786,11 @@ out_free_ar: pcpu_free_alloc_info(ai); return rc; } -#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ +#endif /* BUILD_PAGE_FIRST_CHUNK */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* - * Generic percpu area setup. + * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is @@ -1785,7 +1801,6 @@ out_free_ar: * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -1814,10 +1829,75 @@ void __init setup_per_cpu_areas(void) PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) - panic("Failed to initialized percpu areas."); + panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#else /* CONFIG_SMP */ + +/* + * UP percpu area setup. + * + * UP always uses km-based percpu allocator with identity mapping. + * Static percpu variables are indistinguishable from the usual static + * variables and don't require any special preparation. + */ +void __init setup_per_cpu_areas(void) +{ + const size_t unit_size = + roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, + PERCPU_DYNAMIC_RESERVE)); + struct pcpu_alloc_info *ai; + void *fc; + + ai = pcpu_alloc_alloc_info(1, 1); + fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!ai || !fc) + panic("Failed to allocate memory for percpu areas."); + + ai->dyn_size = unit_size; + ai->unit_size = unit_size; + ai->atom_size = unit_size; + ai->alloc_size = unit_size; + ai->groups[0].nr_units = 1; + ai->groups[0].cpu_map[0] = 0; + + if (pcpu_setup_first_chunk(ai, fc) < 0) + panic("Failed to initialize percpu areas."); +} + +#endif /* CONFIG_SMP */ + +/* + * First and reserved chunks are initialized with temporary allocation + * map in initdata so that they can be used before slab is online. + * This function is called after slab is brought up and replaces those + * with properly allocated maps. 
+ */ +void __init percpu_init_late(void) +{ + struct pcpu_chunk *target_chunks[] = + { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; + struct pcpu_chunk *chunk; + unsigned long flags; + int i; + + for (i = 0; (chunk = target_chunks[i]); i++) { + int *map; + const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); + + BUILD_BUG_ON(size > PAGE_SIZE); + + map = pcpu_mem_alloc(size); + BUG_ON(!map); + + spin_lock_irqsave(&pcpu_lock, flags); + memcpy(map, chunk->map, size); + chunk->map = map; + spin_unlock_irqrestore(&pcpu_lock, flags); + } +} diff --git a/mm/percpu_up.c b/mm/percpu_up.c deleted file mode 100644 index c4351c7f57d2..000000000000 --- a/mm/percpu_up.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * mm/percpu_up.c - dummy percpu memory allocator implementation for UP - */ - -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/slab.h> - -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return kzalloc(size, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -void free_percpu(void __percpu *p) -{ - kfree(p); -} -EXPORT_SYMBOL_GPL(free_percpu); - -phys_addr_t per_cpu_ptr_to_phys(void *addr) -{ - return __pa(addr); -} diff --git a/mm/rmap.c b/mm/rmap.c index 38a336e2eea1..1a8bf76bfd03 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> @@ -79,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void) return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); } -void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } @@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; + /* + * This VMA had no anon_vma yet. This anon_vma is + * the root of any anon_vma tree that might form. 
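
From this point on the rmap code takes anon_vma_lock()/anon_vma_unlock() instead of spinning on anon_vma->lock directly. Those helpers are not part of this file; judging by the root-lock comments here and the __releases(&anon_vma->root->lock) annotation further down, they presumably reduce to something like this sketch:

static inline void anon_vma_lock(struct anon_vma *anon_vma)
{
        spin_lock(&anon_vma->root->lock);
}

static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
        spin_unlock(&anon_vma->root->lock);
}

That is, every anon_vma in a process's tree is serialized by the lock of its root, which is why anon_vma_prepare() just below must set anon_vma->root before the first anon_vma_lock() call.
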
+ */ + anon_vma->root = anon_vma; } - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc->anon_vma = anon_vma; avc->vma = vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - list_add(&avc->same_anon_vma, &anon_vma->head); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (unlikely(allocated)) anon_vma_free(allocated); @@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_add_tail(&avc->same_anon_vma, &anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } /* @@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) avc = anon_vma_chain_alloc(); if (!avc) goto out_error_free_anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + + /* + * The root anon_vma's spinlock is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; + /* + * With KSM refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned + * until this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); return 0; @@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) if (!anon_vma) return; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); - if (empty) + if (empty) { + /* We no longer need the root anon_vma */ + if (anon_vma->root != anon_vma) + drop_anon_vma(anon_vma->root); anon_vma_free(anon_vma); + } } void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - /* Unlink each anon_vma chained to the VMA. */ + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { anon_vma_unlink(avc); list_del(&avc->same_vma); @@ -289,9 +314,9 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; rcu_read_lock(); @@ -302,16 +327,31 @@ struct anon_vma *page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + root_anon_vma = ACCESS_ONCE(anon_vma->root); + spin_lock(&root_anon_vma->lock); + + /* + * If this page is still mapped, then its anon_vma cannot have been + * freed. 
But if it has been unmapped, we have no security against + * the anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot + * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting + * anon_vma->root before page_unlock_anon_vma() is called to unlock. + */ + if (page_mapped(page)) + return anon_vma; + + spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); return NULL; } void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); rcu_read_unlock(); } @@ -326,6 +366,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); unsigned long address; + if (unlikely(is_vm_hugetlb_page(vma))) + pgoff = page->index << huge_page_order(page_hstate(page)); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { /* page should be within @vma mapping range */ @@ -340,9 +382,16 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - if (PageAnon(page)) - ; - else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { + if (PageAnon(page)) { + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. + */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) + return -EFAULT; + } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; @@ -360,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; @@ -369,6 +418,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte_t *pte; spinlock_t *ptl; + if (unlikely(PageHuge(page))) { + pte = huge_pte_offset(mm, address); + ptl = &mm->page_table_lock; + goto check; + } + pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return NULL; @@ -389,6 +444,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, } ptl = pte_lockptr(mm, pmd); +check: spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -691,7 +747,7 @@ int page_mkclean(struct page *page) if (mapping) { ret = page_mkclean_file(mapping, page); if (page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); ret = 1; } } @@ -726,10 +782,10 @@ void page_move_anon_rmap(struct page *page, } /** - * __page_set_anon_rmap - setup new anonymous rmap - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @address: the user virtual address mapped + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page to add to rmap + * @vma: VM area to add page to. 
+ * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, @@ -739,19 +795,16 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); + if (PageAnon(page)) + return; + /* * If the page isn't exclusively mapped into this vma, * we must use the _oldest_ possible anon_vma for the * page mapping! - * - * So take the last AVC chain entry in the vma, which is - * the deepest ancestor, and use the anon_vma from that. */ - if (!exclusive) { - struct anon_vma_chain *avc; - avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); - anon_vma = avc->anon_vma; - } + if (!exclusive) + anon_vma = anon_vma->root; anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; @@ -780,6 +833,7 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ + BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); BUG_ON(page->index != linear_page_index(vma, address)); #endif } @@ -798,6 +852,17 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + do_page_add_anon_rmap(page, vma, address, 0); +} + +/* + * Special version of the above for do_swap_page, which often runs + * into pages that are exclusively owned by the current process. + * Everybody else should continue to use page_add_anon_rmap above. + */ +void do_page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ int first = atomic_inc_and_test(&page->_mapcount); if (first) __inc_zone_page_state(page, NR_ANON_PAGES); @@ -807,7 +872,7 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (first) - __page_set_anon_rmap(page, vma, address, 0); + __page_set_anon_rmap(page, vma, address, exclusive); else __page_check_anon_rmap(page, vma, address); } @@ -870,9 +935,15 @@ void page_remove_rmap(struct page *page) * containing the swap entry, but page not yet written to swap. */ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); set_page_dirty(page); } + /* + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED + * and not charged by memcg for now. + */ + if (unlikely(PageHuge(page))) + return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, NR_ANON_PAGES); @@ -1368,6 +1439,42 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +/* + * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root + * if necessary. Be careful to do all the tests under the lock. Once + * we know we are the last user, nobody else can get a reference and we + * can do the freeing without the lock. + */ +void drop_anon_vma(struct anon_vma *anon_vma) +{ + BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + struct anon_vma *root = anon_vma->root; + int empty = list_empty(&anon_vma->head); + int last_root_user = 0; + int root_empty = 0; + + /* + * The refcount on a non-root anon_vma got dropped. 
Drop + * the refcount on the root and check if we need to free it. + */ + if (empty && anon_vma != root) { + BUG_ON(atomic_read(&root->external_refcount) <= 0); + last_root_user = atomic_dec_and_test(&root->external_refcount); + root_empty = list_empty(&root->head); + } + anon_vma_unlock(anon_vma); + + if (empty) { + anon_vma_free(anon_vma); + if (root_empty && last_root_user) + anon_vma_free(root); + } + } +} +#endif + #ifdef CONFIG_MIGRATION /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): @@ -1389,7 +1496,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1399,7 +1506,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); return ret; } @@ -1445,3 +1552,49 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, return rmap_walk_file(page, rmap_one, arg); } #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_HUGETLB_PAGE +/* + * The following three functions are for anonymous (private mapped) hugepages. + * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. + */ +static void __hugepage_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + + if (PageAnon(page)) + return; + if (!exclusive) + anon_vma = anon_vma->root; + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); +} + +void hugepage_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + + BUG_ON(!PageLocked(page)); + BUG_ON(!anon_vma); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + first = atomic_inc_and_test(&page->_mapcount); + if (first) + __hugepage_set_anon_rmap(page, vma, address, 0); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); + __hugepage_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/shmem.c b/mm/shmem.c index f65f84062db5..47fdeeb9d636 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/percpu_counter.h> #include <linux/swap.h> static struct vfsmount *shm_mnt; @@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks += pages; + percpu_counter_add(&sbinfo->used_blocks, -pages); + spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } } @@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long if (sgp == SGP_READ) return shmem_swp_map(ZERO_PAGE(0)); /* - * Test free_blocks 
against 1 not 0, since we have 1 data + * Test used_blocks against 1 less max_blocks, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. */ if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks <= 1) { - spin_unlock(&sbinfo->stat_lock); + if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) return ERR_PTR(-ENOSPC); - } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); @@ -767,6 +766,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) loff_t newsize = attr->ia_size; int error; + error = inode_change_ok(inode, attr); + if (error) + return error; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) && newsize != inode->i_size) { struct page *page = NULL; @@ -801,25 +804,22 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) } } - error = simple_setsize(inode, newsize); + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, newsize); if (page) page_cache_release(page); - if (error) - return error; shmem_truncate_range(inode, newsize, (loff_t)-1); } - error = inode_change_ok(inode, attr); - if (!error) - generic_setattr(inode, attr); + setattr_copy(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL - if (!error && (attr->ia_valid & ATTR_MODE)) + if (attr->ia_valid & ATTR_MODE) error = generic_acl_chmod(inode); #endif return error; } -static void shmem_delete_inode(struct inode *inode) +static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); @@ -836,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode) } BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); - clear_inode(inode); + end_writeback(inode); } static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) @@ -933,7 +933,7 @@ found: /* * Move _head_ to start search for next from here. - * But be careful: shmem_delete_inode checks list_empty without taking + * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist * would appear empty, if it were the only one on shmem_swaplist. We * could avoid doing it if inode NULL; or use this minor optimization. 
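(Editor's illustrative sketch, not part of the patch.) The shmem hunks above replace the stat_lock-protected free_blocks count with a percpu_counter of used blocks, so the common allocation path no longer takes a global spinlock; only a comparison that lands close to max_blocks falls back to an exact sum. A minimal charge/uncharge sketch in kernel idiom follows; struct shmem_sb_limits and the helper names are hypothetical, and the real patch additionally keeps one block in reserve for index pages in shmem_swp_alloc():

#include <linux/percpu_counter.h>
#include <linux/errno.h>

/* Hypothetical cut-down view of the accounting the patch switches to. */
struct shmem_sb_limits {
	unsigned long max_blocks;		/* 0 means "no limit" */
	struct percpu_counter used_blocks;	/* replaces free_blocks + stat_lock */
};

/* Charge one block against the filesystem-wide limit, if any. */
static int shmem_sb_charge_block(struct shmem_sb_limits *l)
{
	if (!l->max_blocks)
		return 0;
	/*
	 * percpu_counter_compare() only does an exact percpu_counter_sum()
	 * when the approximate count is within the per-cpu batch error of
	 * the limit; otherwise the cheap approximate value decides.
	 */
	if (percpu_counter_compare(&l->used_blocks, l->max_blocks) >= 0)
		return -ENOSPC;
	percpu_counter_inc(&l->used_blocks);
	return 0;
}

/* Give blocks back when pages are freed or truncated. */
static void shmem_sb_uncharge_blocks(struct shmem_sb_limits *l, long pages)
{
	if (l->max_blocks)
		percpu_counter_add(&l->used_blocks, -pages);
}

The counter would be set up with percpu_counter_init() and torn down with percpu_counter_destroy(), exactly as the shmem_fill_super() and shmem_put_super() hunks further down do for sbinfo->used_blocks.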
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct shmem_sb_info *sbinfo; struct page *filepage = *pagep; struct page *swappage; + struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; gfp_t gfp; @@ -1247,7 +1248,6 @@ repeat: filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; - error = 0; gfp = mapping_gfp_mask(mapping); if (!filepage) { /* @@ -1258,7 +1258,19 @@ repeat: if (error) goto failed; radix_tree_preload_end(); + if (sgp != SGP_READ && !prealloc_page) { + /* We don't care if this fails */ + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; + } + } + } } + error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1387,17 +1399,16 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks == 0 || + if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || shmem_acct_block(info->flags)) { - spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); error = -ENOSPC; goto failed; } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; @@ -1407,28 +1418,38 @@ repeat: if (!filepage) { int ret; - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); + if (!prealloc_page) { + spin_unlock(&info->lock); + filepage = shmem_alloc_page(gfp, info, idx); + if (!filepage) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + error = -ENOMEM; + goto failed; + } + SetPageSwapBacked(filepage); - /* Precharge page while we can wait, compensate after */ - error = mem_cgroup_cache_charge(filepage, current->mm, - GFP_KERNEL); - if (error) { - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - goto failed; + /* + * Precharge page while we can wait, compensate + * after + */ + error = mem_cgroup_cache_charge(filepage, + current->mm, GFP_KERNEL); + if (error) { + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + goto failed; + } + + spin_lock(&info->lock); + } else { + filepage = prealloc_page; + prealloc_page = NULL; + SetPageSwapBacked(filepage); } - spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) error = PTR_ERR(entry); @@ -1469,13 +1490,19 @@ repeat: } done: *pagep = filepage; - return 0; + error = 0; + goto out; failed: if (*pagep != filepage) { unlock_page(filepage); page_cache_release(filepage); } +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } return error; } @@ -1559,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; @@ -1791,17 
+1819,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; - spin_lock(&sbinfo->stat_lock); if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_bavail = buf->f_bfree = + sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ - spin_unlock(&sbinfo->stat_lock); return 0; } @@ -1877,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); /* New dentry reference */ + ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: @@ -2120,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, if (*len < 3) return 255; - if (hlist_unhashed(&inode->i_hash)) { + if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try @@ -2128,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); @@ -2242,7 +2269,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info config = *sbinfo; - unsigned long blocks; unsigned long inodes; int error = -EINVAL; @@ -2250,9 +2276,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) return error; spin_lock(&sbinfo->stat_lock); - blocks = sbinfo->max_blocks - sbinfo->free_blocks; inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (config.max_blocks < blocks) + if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) goto out; if (config.max_inodes < inodes) goto out; @@ -2269,7 +2294,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) error = 0; sbinfo->max_blocks = config.max_blocks; - sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2302,7 +2326,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) static void shmem_put_super(struct super_block *sb) { - kfree(sb->s_fs_info); + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + percpu_counter_destroy(&sbinfo->used_blocks); + kfree(sbinfo); sb->s_fs_info = NULL; } @@ -2344,7 +2371,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - sbinfo->free_blocks = sbinfo->max_blocks; + if (percpu_counter_init(&sbinfo->used_blocks, 0)) + goto failed; sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = SHMEM_MAX_BYTES; @@ -2496,7 +2524,7 @@ static const struct super_operations shmem_ops = { .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif - .delete_inode = shmem_delete_inode, + .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, }; @@ -2510,16 
+2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = { }; -static int shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *shmem_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); + return mount_nodev(fs_type, flags, data, shmem_fill_super); } static struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .get_sb = shmem_get_sb, + .mount = shmem_mount, .kill_sb = kill_litter_super, }; @@ -2615,7 +2643,7 @@ out: static struct file_system_type tmpfs_fs_type = { .name = "tmpfs", - .get_sb = ramfs_get_sb, + .mount = ramfs_mount, .kill_sb = kill_litter_super, }; diff --git a/mm/slab.c b/mm/slab.c index e49f8f46f46d..b1e40dafbab3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,6 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> -#include <linux/kmemtrace.h> #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -395,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -861,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); + INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } @@ -902,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; @@ -2331,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif diff --git a/mm/slob.c b/mm/slob.c index 23631e2bb57a..617b6d6c42c7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -66,8 +66,10 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> -#include <linux/kmemtrace.h> #include <linux/kmemleak.h> + +#include <trace/events/kmem.h> + #include <asm/atomic.h> /* @@ -394,6 +396,7 @@ static void slob_free(void *block, int size) slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; + struct list_head *slob_list; if (unlikely(ZERO_OR_NULL_PTR(block))) return; @@ -422,7 +425,13 @@ static void slob_free(void *block, int size) set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - set_slob_page_free(sp, &free_slob_small); + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + 
set_slob_page_free(sp, slob_list); goto out; } @@ -491,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { unsigned int order = get_order(size); - ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -639,7 +650,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); - INIT_RCU_HEAD(&slob_rcu->head); slob_rcu->size = c->size; call_rcu(&slob_rcu->head, kmem_rcu_free); } else { diff --git a/mm/slub.c b/mm/slub.c index 578f68f3c51f..8fd5401bb071 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -17,7 +17,6 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/kmemtrace.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -107,11 +106,17 @@ * the fast path and disables lockless freelists. */ +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) + +static inline int kmem_cache_debug(struct kmem_cache *s) +{ #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG 1 + return unlikely(s->flags & SLAB_DEBUG_FLAGS); #else -#define SLABDEBUG 0 + return 0; #endif +} /* * Issues still to be resolved: @@ -162,8 +167,7 @@ #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ +#define __OBJECT_POISON 0x80000000UL /* Poison object */ static int kmem_size = sizeof(struct kmem_cache); @@ -173,7 +177,7 @@ static struct notifier_block slab_notifier; static enum { DOWN, /* No slab functionality available */ - PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + PARTIAL, /* Kmem_cache_node works */ UP, /* Everything works but does not show up in sysfs */ SYSFS /* Sysfs up */ } slab_state = DOWN; @@ -194,7 +198,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); @@ -205,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { + kfree(s->name); kfree(s); } @@ -228,11 +233,7 @@ int slab_is_available(void) static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { -#ifdef CONFIG_NUMA return s->node[node]; -#else - return &s->local_node; -#endif } /* Verify that a pointer has an address that is valid within a slab page */ @@ -489,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) dump_stack(); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; @@ -499,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); + memset(p + s->objsize, val, s->inuse - s->objsize); } static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) @@ -636,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) } static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) + void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->objsize; if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, red, s->inuse - s->objsize)) + endobject, val, s->inuse - s->objsize)) return 0; } else { if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { @@ -656,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page, } if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->objsize - 1) || !check_bytes_and_report(s, page, p, "Poison", @@ -668,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && active) + if (!s->offset && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -787,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->objsize, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +{ + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} + +static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) +{ + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); +} + +/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache_node *n, struct page *page) @@ -833,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). 
*/ - if (!NUMA_BUILD || n) { + if (n) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -853,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) return; - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -873,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (!check_object(s, page, object, 0)) + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); - init_object(s, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: @@ -897,8 +926,8 @@ bad: return 0; } -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, unsigned long addr) +static noinline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -913,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - if (!check_object(s, page, object, 1)) + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) return 0; if (unlikely(s != page->slab)) { @@ -937,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); return 1; fail: @@ -1041,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s, static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } + void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, @@ -1061,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#endif + +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) + { return 0; } + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + void *object) {} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + +static inline void slab_free_hook_irq(struct kmem_cache *s, + void *object) {} + +#endif /* CONFIG_SLUB_DEBUG */ /* * Slab allocation and freeing @@ -1073,7 +1114,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, flags |= __GFP_NOTRACK; - if (node == -1) + if (node == NUMA_NO_NODE) return alloc_pages(flags, order); else return alloc_pages_exact_node(node, flags, order); @@ -1157,9 +1198,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | 
SLAB_TRACE)) - __SetPageSlubDebug(page); start = page_address(page); @@ -1186,14 +1224,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SLABDEBUG && PageSlubDebug(page))) { + if (kmem_cache_debug(s)) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) - check_object(s, page, p, 0); - __ClearPageSlubDebug(page); + check_object(s, page, p, SLUB_RED_INACTIVE); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1273,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } +static inline void __remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + list_del(&page->lru); + n->nr_partial--; +} + static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); spin_unlock(&n->list_lock); } @@ -1292,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); __SetPageSlubFrozen(page); return 1; } @@ -1387,10 +1429,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int searchnode = (node == -1) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; page = get_partial_node(get_node(s, searchnode)); - if (page || (flags & __GFP_THISNODE)) + if (page || node != -1) return page; return get_any_partial(s, flags); @@ -1404,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * On exit the slab lock will have been dropped. */ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) + __releases(bitlock) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1415,8 +1458,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(s, DEACTIVATE_FULL); - if (SLABDEBUG && PageSlubDebug(page) && - (s->flags & SLAB_STORE_USER)) + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); @@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + __releases(bitlock) { struct page *page = c->page; int tail = 1; @@ -1515,7 +1558,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct kmem_cache_cpu *c, int node) { #ifdef CONFIG_NUMA - if (node != -1 && c->node != node) + if (node != NUMA_NO_NODE && c->node != node) return 0; #endif return 1; @@ -1624,7 +1667,7 @@ load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) + if (kmem_cache_debug(s)) goto debug; c->freelist = get_freepointer(s, object); @@ -1647,6 +1690,7 @@ new_slab: goto load_freelist; } + gfpflags &= gfp_allowed_mask; if (gfpflags & __GFP_WAIT) local_irq_enable(); @@ -1674,7 +1718,7 @@ debug: c->page->inuse++; c->page->freelist = get_freepointer(s, object); - c->node = -1; + c->node = NUMA_NO_NODE; goto unlock_out; } @@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - gfpflags &= gfp_allowed_mask; - - lockdep_trace_alloc(gfpflags); - might_sleep_if(gfpflags & __GFP_WAIT); - - if (should_failslab(s->objsize, gfpflags, s->flags)) + if (slab_pre_alloc_hook(s, gfpflags)) return NULL; local_irq_save(flags); @@ -1719,15 +1758,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); + slab_post_alloc_hook(s, gfpflags, object); return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); @@ -1738,7 +1776,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc_notrace); #endif @@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif #ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, @@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); #endif +#endif /* * Slow patch handling. 
This may still be called frequently since objects @@ -1783,7 +1821,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SLABDEBUG && PageSlubDebug(page))) + if (kmem_cache_debug(s)) goto debug; checks_ok: @@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - kmemleak_free_recursive(x, s->flags); + slab_free_hook(s, x); + local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); - if (likely(page == c->page && c->node >= 0)) { + + slab_free_hook_irq(s, x); + + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); c->freelist = object; stat(s, FREE_FASTPATH); @@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { - if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) - /* - * Boot time creation of the kmalloc array. Use static per cpu data - * since the per cpu allocator is not available yet. - */ - s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); - else - s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); - if (!s->cpu_slab) - return 0; + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); - return 1; + return s->cpu_slab != NULL; } -#ifdef CONFIG_NUMA +static struct kmem_cache *kmem_cache_node; + /* * No kmalloc_node yet so do it by hand. We know that this is the first * slab on the node for this slabcache. There are no concurrent accesses @@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. 
*/ -static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) +static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; unsigned long flags; - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!page); if (page_to_nid(page) != node) { @@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); + page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; - kmalloc_caches->node[node] = n; + kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG - init_object(kmalloc_caches, n, 1); - init_tracking(kmalloc_caches, n); + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmalloc_caches); - inc_slabs_node(kmalloc_caches, node, page->objects); + init_kmem_cache_node(n, kmem_cache_node); + inc_slabs_node(kmem_cache_node, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; + if (n) - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; } } -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; @@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) struct kmem_cache_node *n; if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); + early_kmem_cache_node_alloc(node); continue; } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); if (!n) { free_kmem_cache_nodes(s); @@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } return 1; } -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} - -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) -{ - init_kmem_cache_node(&s->local_node, s); - return 1; -} -#endif static void set_min_partial(struct kmem_cache *s, unsigned long min) { @@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, +static int kmem_cache_open(struct kmem_cache *s, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif - if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (!init_kmem_cache_nodes(s)) goto error; - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s)) return 1; free_kmem_cache_nodes(s); @@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), - GFP_ATOMIC); - + unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * + sizeof(long), GFP_ATOMIC); if (!map) return; slab_err(s, page, "%s", text); @@ -2448,9 +2468,8 @@ static void 
free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - list_del(&page->lru); + __remove_partial(n, page); discard_slab(s, page); - n->nr_partial--; } else { list_slab_objects(s, page, "Objects remaining on kmem_cache_close()"); @@ -2490,7 +2509,6 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); - up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -2499,8 +2517,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } else - up_write(&slub_lock); + } + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2508,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; +struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); +static struct kmem_cache *kmem_cache; + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; +#endif + static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2547,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) +static struct kmem_cache *__init create_kmalloc_cache(const char *name, + int size, unsigned int flags) { - unsigned int flags = 0; + struct kmem_cache *s; - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; + s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slub_lock here. 
*/ - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - - if (sysfs_slab_add(s)) - goto panic; return s; panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); + return NULL; } -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; - -static void sysfs_add_func(struct work_struct *w) -{ - struct kmem_cache *s; - - down_write(&slub_lock); - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & __SYSFS_ADD_DEFERRED) { - s->flags &= ~__SYSFS_ADD_DEFERRED; - sysfs_slab_add(s); - } - } - up_write(&slub_lock); -} - -static DECLARE_WORK(sysfs_add_work, sysfs_add_func); - -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) -{ - struct kmem_cache *s; - char *text; - size_t realsize; - unsigned long slabflags; - int i; - - s = kmalloc_caches_dma[index]; - if (s) - return s; - - /* Dynamically create dma cache */ - if (flags & __GFP_WAIT) - down_write(&slub_lock); - else { - if (!down_write_trylock(&slub_lock)) - goto out; - } - - if (kmalloc_caches_dma[index]) - goto unlock_out; - - realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - - s = NULL; - for (i = 0; i < KMALLOC_CACHES; i++) - if (!kmalloc_caches[i].size) - break; - - BUG_ON(i >= KMALLOC_CACHES); - s = kmalloc_caches + i; - - /* - * Must defer sysfs creation to a workqueue because we don't know - * what context we are called from. Before sysfs comes up, we don't - * need to do anything because our sysfs initcall will start by - * adding all existing slabs to sysfs. - */ - slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; - if (slab_state >= SYSFS) - slabflags |= __SYSFS_ADD_DEFERRED; - - if (!text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - s->size = 0; - kfree(text); - goto unlock_out; - } - - list_add(&s->list, &slab_caches); - kmalloc_caches_dma[index] = s; - - if (slab_state >= SYSFS) - schedule_work(&sysfs_add_work); - -unlock_out: - up_write(&slub_lock); -out: - return kmalloc_caches_dma[index]; -} -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -2709,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) - return dma_kmalloc_cache(index, flags); + return kmalloc_dma_caches[index]; #endif - return &kmalloc_caches[index]; + return kmalloc_caches[index]; } void *__kmalloc(size_t size, gfp_t flags) @@ -2728,7 +2665,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -2736,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; @@ -2750,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; @@ -2890,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. 
*/ - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { @@ -2915,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -2957,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg) BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); } } up_read(&slub_lock); @@ -2990,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg) * since memory is not yet available from the node that * is brought up. */ - n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -3036,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self, * Basic setup of slabs *******************************************************************/ +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator + */ + +static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +{ + int node; + + list_add(&s->list, &slab_caches); + s->refcount = -1; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + struct page *p; + + if (n) { + list_for_each_entry(p, &n->partial, lru) + p->slab = s; + +#ifdef CONFIG_SLAB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab = s; +#endif + } + } +} + void __init kmem_cache_init(void) { int i; int caches = 0; + struct kmem_cache *temp_kmem_cache; + int order; + struct kmem_cache *temp_kmem_cache_node; + unsigned long kmalloc_size; + + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); + + /* Allocate two kmem_caches from the page allocator */ + kmalloc_size = ALIGN(kmem_size, cache_line_size()); + order = get_order(2 * kmalloc_size); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); -#ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the * struct kmem_cache_node's. There is special bootstrap code in * kmem_cache_open for slab_state == DOWN. 
*/ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_NOWAIT); - kmalloc_caches[0].refcount = -1; - caches++; + kmem_cache_node = (void *)kmem_cache + kmalloc_size; + + kmem_cache_open(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif /* Able to allocate the per node structures */ slab_state = PARTIAL; - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_NOWAIT); - caches++; - } - if (KMALLOC_MIN_SIZE <= 64) { - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_NOWAIT); - caches++; - } + temp_kmem_cache = kmem_cache; + kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache, temp_kmem_cache, kmem_size); - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_NOWAIT); - caches++; - } + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + caches++; + kmem_cache_bootstrap_fixup(kmem_cache); + caches++; + /* Free temporary boot structure */ + free_pages((unsigned long)temp_kmem_cache, order); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ /* * Patch up the size_index table if we have strange large alignment @@ -3115,23 +3097,60 @@ void __init kmem_cache_init(void) size_index[size_index_elem(i)] = 8; } + /* Caches that are not of the two-to-the-power-of size */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); + caches++; + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); + caches++; + } + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) - kmalloc_caches[i]. 
name = - kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[1]->name); + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[2]->name); + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + + BUG_ON(!s); + kmalloc_caches[i]->name = s; + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_NUMA - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); -#else - kmem_size = sizeof(struct kmem_cache); -#endif +#ifdef CONFIG_ZONE_DMA + for (i = 0; i < SLUB_PAGE_SHIFT; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s && s->size) { + char *name = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", s->objsize); + + BUG_ON(!name); + kmalloc_dma_caches[i] = create_kmalloc_cache(name, + s->objsize, SLAB_CACHE_DMA); + } + } +#endif printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -3209,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + char *n; if (WARN_ON(!name)) return NULL; @@ -3223,32 +3243,34 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, */ s->objsize = max(s->objsize, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - up_write(&slub_lock); if (sysfs_slab_alias(s, name)) { - down_write(&slub_lock); s->refcount--; - up_write(&slub_lock); goto err; } + up_write(&slub_lock); return s; } + n = kstrdup(name, GFP_KERNEL); + if (!n) + goto err; + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { - if (kmem_cache_open(s, GFP_KERNEL, name, + if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); - up_write(&slub_lock); if (sysfs_slab_add(s)) { - down_write(&slub_lock); list_del(&s->list); - up_write(&slub_lock); + kfree(n); kfree(s); goto err; } + up_write(&slub_lock); return s; } + kfree(n); kfree(s); } up_write(&slub_lock); @@ -3312,7 +3334,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); /* Honor the call site pointer we recieved. 
*/ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -3320,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { @@ -3348,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +#endif -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int count_inuse(struct page *page) { return page->inuse; @@ -3359,7 +3383,9 @@ static int count_total(struct page *page) { return page->objects; } +#endif +#ifdef CONFIG_SLUB_DEBUG static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3395,16 +3421,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); - - if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug not set " - "on slab 0x%p\n", s->name, page); - } else { - if (PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug set on " - "slab 0x%p\n", s->name, page); - } } static int validate_slab_node(struct kmem_cache *s, @@ -3460,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s) kfree(map); return count; } - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches + 4); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches + 5); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches + 6); - - printk(KERN_ERR "\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 7); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches + 8); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 9); -} -#else -static void resiliency_test(void) {}; -#endif - /* * Generate lists of code addresses where slabcache objects are allocated * and freed. 
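(Editor's illustrative sketch, not part of the patch.) The hunk above can drop the PageSlubDebug consistency checks because debug state is now derived from the cache's own flags via kmem_cache_debug() rather than from a per-page flag. In the same spirit, this slub.c diff gathers the debug-only work on the allocation and free fast paths into the slab_pre_alloc_hook()/slab_post_alloc_hook()/slab_free_hook() helpers, which are reduced to empty static inlines when CONFIG_SLUB_DEBUG is off. A simplified userspace analogue of that hook pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

#ifdef MYALLOC_DEBUG
static inline int pre_alloc_hook(size_t size)
{
	/* debug-only checks (e.g. fault injection) would live here */
	fprintf(stderr, "alloc %zu bytes\n", size);
	return 0;			/* non-zero would veto the allocation */
}

static inline void post_alloc_hook(void *obj, size_t size)
{
	/* e.g. poisoning or leak-tracker registration */
	fprintf(stderr, "got %p (%zu bytes)\n", obj, size);
}
#else
static inline int pre_alloc_hook(size_t size) { (void)size; return 0; }
static inline void post_alloc_hook(void *obj, size_t size) { (void)obj; (void)size; }
#endif

static void *my_alloc(size_t size)
{
	void *obj;

	if (pre_alloc_hook(size))
		return NULL;
	obj = malloc(size);		/* stands in for the real fast path */
	post_alloc_hook(obj, size);
	return obj;
}

int main(void)
{
	free(my_alloc(64));
	return 0;
}

Built without -DMYALLOC_DEBUG the hooks compile away entirely, which is the effect the patch's own comment ("In a typical production configuration these hooks all should produce no code at all") describes for the kernel case.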
@@ -3647,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc, - long *map) + unsigned long *map) { void *addr = page_address(page); void *p; @@ -3747,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } +#endif + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches[4]); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif +#ifdef CONFIG_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -3800,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } + down_read(&slub_lock); +#ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3816,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } - } else if (flags & SO_PARTIAL) { + } else +#endif + if (flags & SO_PARTIAL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3841,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return x + sprintf(buf + x, "\n"); } +#ifdef CONFIG_SLUB_DEBUG static int any_slab_objects(struct kmem_cache *s) { int node; @@ -3856,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s) } return 0; } +#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj); @@ -3957,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(aliases); -static ssize_t slabs_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL); -} -SLAB_ATTR_RO(slabs); - static ssize_t partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL); @@ -3987,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); -static ssize_t total_objects_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); -} -SLAB_ATTR_RO(total_objects); - -static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } -static ssize_t sanity_checks_store(struct kmem_cache *s, +static ssize_t reclaim_account_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_RECLAIM_ACCOUNT; if (buf[0] == '1') - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_RECLAIM_ACCOUNT; return length; } -SLAB_ATTR(sanity_checks); +SLAB_ATTR(reclaim_account); -static ssize_t trace_show(struct kmem_cache *s, char *buf) +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } +SLAB_ATTR_RO(hwcache_align); -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') - s->flags |= SLAB_TRACE; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } -SLAB_ATTR(trace); +SLAB_ATTR_RO(cache_dma); +#endif -#ifdef CONFIG_FAILSLAB -static ssize_t failslab_show(struct kmem_cache *s, char *buf) +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); } +SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t failslab_store(struct 
kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_FAILSLAB; - if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; - return length; + return show_slab_objects(s, buf, SO_ALL); } -SLAB_ATTR(failslab); -#endif +SLAB_ATTR_RO(slabs); -static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); } +SLAB_ATTR_RO(total_objects); -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); } -SLAB_ATTR(reclaim_account); -static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; } -SLAB_ATTR_RO(hwcache_align); +SLAB_ATTR(sanity_checks); -#ifdef CONFIG_ZONE_DMA -static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } -SLAB_ATTR_RO(cache_dma); -#endif -static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; } -SLAB_ATTR_RO(destroy_by_rcu); +SLAB_ATTR(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { @@ -4151,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t shrink_show(struct kmem_cache *s, char *buf) { return 0; @@ -4170,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s, } SLAB_ATTR(shrink); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, 
TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); - #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4291,25 +4320,27 @@ static struct attribute *slab_attrs[] = { &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, - &total_objects_attr.attr, - &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, &aliases_attr.attr, &align_attr.attr, - &sanity_checks_attr.attr, - &trace_attr.attr, &hwcache_align_attr.attr, &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, + &shrink_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, +#endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4389,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj) { struct kmem_cache *s = to_slab(kobj); + kfree(s->name); kfree(s); } @@ -4504,6 +4536,13 @@ static int sysfs_slab_add(struct kmem_cache *s) static void sysfs_slab_remove(struct kmem_cache *s) { + if (slab_state < SYSFS) + /* + * Sysfs has not been setup yet so no need to remove the + * cache from sysfs. + */ + return; + kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -4549,8 +4588,11 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; + down_write(&slub_lock); + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { + up_write(&slub_lock); printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -4575,12 +4617,13 @@ static int __init slab_sysfs_init(void) kfree(al); } + up_write(&slub_lock); resiliency_test(); return 0; } __initcall(slab_sysfs_init); -#endif +#endif /* CONFIG_SYSFS */ /* * The /proc/slabinfo ABI diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index aa33fd67fa41..29d6cbffb283 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ -#ifdef CONFIG_NO_BOOTMEM - free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); - if (vmemmap_buf_start < vmemmap_buf) { - char name[15]; - - snprintf(name, sizeof(name), "MEMMAP %d", nodeid); - reserve_early_without_check(__pa(vmemmap_buf_start), - __pa(vmemmap_buf), name); - } -#else free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); -#endif vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/swap.c b/mm/swap.c index 3ce7bc373a52..3f4854205b16 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_free(&pages_to_free); } +EXPORT_SYMBOL(release_pages); /* * The pages which we're about to release may be in the deferred lru-addition diff --git a/mm/swapfile.c b/mm/swapfile.c index 03aa2d55f1a2..67ddaaf98c74 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> +#include <linux/poll.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + static 
inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -139,8 +144,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + nr_blocks, GFP_KERNEL, 0); if (err) return err; cond_resched(); @@ -151,8 +155,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + nr_blocks, GFP_KERNEL, 0); if (err) break; @@ -191,8 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER)) + nr_blocks, GFP_NOIO, 0)) break; } @@ -686,6 +688,24 @@ int try_to_free_swap(struct page *page) if (page_swapcount(page)) return 0; + /* + * Once hibernation has begun to create its image of memory, + * there's a danger that one of the calls to try_to_free_swap() + * - most probably a call from __try_to_reclaim_swap() while + * hibernation is allocating its own swap pages for the image, + * but conceivably even a call from memory reclaim - will free + * the swap from a page which has already been recorded in the + * image as a clean swapcache page, and then reuse its swap for + * another page of the image. On waking from hibernation, the + * original page might be freed under memory pressure, then + * later read back in from swap, now with the wrong data. + * + * Hibernation clears bits from gfp_allowed_mask to prevent + * memory reclaim from writing to disk, so check that here. 
+ */ + if (!(gfp_allowed_mask & __GFP_IO)) + return 0; + delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1665,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1673,6 +1695,25 @@ out: } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { @@ -1756,7 +1797,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1764,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -2032,7 +2091,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if (discard_swap(p) == 0) + if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) p->flags |= SWP_DISCARDABLE; } @@ -2069,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: diff --git a/mm/truncate.c b/mm/truncate.c index 937571b8b233..ba887bff48c5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) EXPORT_SYMBOL(truncate_pagecache); /** + * truncate_setsize - update inode and pagecache for a new file size + * @inode: inode + * @newsize: new file size + * + * truncate_setsize updastes i_size update and performs pagecache + * truncation (if necessary) for a file size updates. It will be + * typically be called from the filesystem's setattr function when + * ATTR_SIZE is passed in. + * + * Must be called with inode_mutex held and after all filesystem + * specific block truncation has been performed. + */ +void truncate_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + + truncate_pagecache(inode, oldsize, newsize); +} +EXPORT_SYMBOL(truncate_setsize); + +/** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. 
- * - * This function is deprecated and simple_setsize or truncate_pagecache - * should be used instead. + * This function is deprecated and truncate_setsize or truncate_pagecache + * should be used instead, together with filesystem specific block truncation. */ int vmtruncate(struct inode *inode, loff_t offset) { int error; - error = simple_setsize(inode, offset); + error = inode_newsize_ok(inode, offset); if (error) return error; + truncate_setsize(inode, offset); if (inode->i_op->truncate) inode->i_op->truncate(inode); - - return error; + return 0; } EXPORT_SYMBOL(vmtruncate); diff --git a/mm/util.c b/mm/util.c index f5712e8964be..73dac81e9f78 100644 --- a/mm/util.c +++ b/mm/util.c @@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n) if (length > n) return ERR_PTR(-EINVAL); - p = kmalloc(length, GFP_KERNEL); + p = memdup_user(s, length); - if (!p) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(p, s, length)) { - kfree(p); - return ERR_PTR(-EFAULT); - } + if (IS_ERR(p)) + return p; p[length - 1] = '\0'; @@ -250,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + * If the architecture not support this fucntion, simply return with no + * page pinned + */ +int __attribute__((weak)) __get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + return 0; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae007462b7f6..a3d66b3dc5cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +bool vmap_lazy_unmap __read_mostly = true; /*** Page table manipulation functions ***/ @@ -292,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); @@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void) { unsigned int log; + if (!vmap_lazy_unmap) + return 0; + log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -513,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void purge_fragmented_blocks_allcpus(void); /* + * called before a call to iounmap() if the caller wants vm_area_struct's + * immediately freed. + */ +void set_iounmap_nonlazy(void) +{ + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); +} + +/* * Purges all lazily-freed vmap areas. * * If sync is 0 then don't purge if there is already a purge in progress. 
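The hunks above make the lazy vmap purge tunable: lazy_max_pages() returns 0 when vmap_lazy_unmap is cleared (so areas are purged immediately) and otherwise scales the purge threshold with the number of online CPUs, while set_iounmap_nonlazy() pushes vmap_lazy_nr past that threshold so the next purge fires right away. A minimal userspace sketch of the threshold arithmetic, assuming 4KiB pages; fls_u() is only a stand-in for the kernel's fls():

/*
 * Model of the lazy_max_pages() calculation: one 32MB step per power of
 * two of online CPUs, or zero when lazy unmapping is disabled.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL

/* Position of the highest set bit, 1-based; returns 0 for an input of 0. */
static unsigned int fls_u(unsigned int x)
{
        unsigned int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

static unsigned long lazy_max_pages_model(unsigned int online_cpus, int lazy_enabled)
{
        unsigned int log;

        if (!lazy_enabled)              /* vmap_lazy_unmap cleared */
                return 0;

        log = fls_u(online_cpus);
        return log * (32UL * 1024 * 1024 / MODEL_PAGE_SIZE);
}

int main(void)
{
        unsigned int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 2)
                printf("%2u CPUs -> purge after %lu lazily freed pages\n",
                       cpus, lazy_max_pages_model(cpus, 1));
        return 0;
}

Because the threshold grows with fls() rather than with the CPU count itself, doubling the number of CPUs only adds one more 32MB step of lazily held address space.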
@@ -732,7 +745,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) node, gfp_mask); if (unlikely(IS_ERR(va))) { kfree(vb); - return ERR_PTR(PTR_ERR(va)); + return ERR_CAST(va); } err = radix_tree_preload(gfp_mask); @@ -1583,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1594,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); /** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * @@ -1640,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -2052,6 +2107,7 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +#ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return n ? 
rb_entry(n, struct vmap_area, rb_node) : NULL; @@ -2332,9 +2388,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) free_vm_area(vms[i]); kfree(vms); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -2361,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } @@ -2403,7 +2462,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%lx", v->phys_addr); + seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) seq_printf(m, " ioremap"); @@ -2437,8 +2496,11 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) + if (NUMA_BUILD) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + if (ptr == NULL) + return -ENOMEM; + } ret = seq_open(file, &vmalloc_op); if (!ret) { struct seq_file *m = file->private_data; diff --git a/mm/vmscan.c b/mm/vmscan.c index b94fe1b3da43..b8a6fdc21312 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,15 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/vmscan.h> + +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -76,10 +85,10 @@ struct scan_control { int order; /* - * Intend to reclaim enough contenious memory rather than to reclaim - * enough amount memory. I.e, it's the mode for high order allocation. + * Intend to reclaim enough continuous memory rather than reclaim + * enough amount of memory. i.e, mode for high order allocation. */ - bool lumpy_reclaim_mode; + enum lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -262,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. 
+ */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -272,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -280,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -304,12 +348,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -327,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -363,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -373,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; @@ -391,13 +428,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { /* synchronous write or broken a_ops? 
*/ ClearPageReclaim(page); } + trace_mm_vmscan_writepage(page, + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -575,7 +615,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) return PAGEREF_RECLAIM; /* @@ -611,27 +651,46 @@ static enum page_references page_check_references(struct page *page, } /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page) + if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; } +static noinline_for_stack void free_page_list(struct list_head *free_pages) +{ + struct pagevec freed_pvec; + struct page *page, *tmp; + + pagevec_init(&freed_pvec, 1); + + list_for_each_entry_safe(page, tmp, free_pages, lru) { + list_del(&page->lru); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + } + + pagevec_free(&freed_pvec); +} + /* * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct zone *zone, + struct scan_control *sc) { LIST_HEAD(ret_pages); - struct pagevec freed_pvec; + LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); - pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { enum page_references references; struct address_space *mapping; @@ -647,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -672,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -721,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -729,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -806,10 +875,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, __clear_page_locked(page); free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } + + /* + * Is there need to periodically free_page_list? 
It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); continue; cull_mlocked: @@ -817,6 +888,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -829,12 +901,24 @@ activate_locked: keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + + free_page_list(&free_pages); + list_splice(&ret_pages, page_list); - if (pagevec_count(&freed_pvec)) - __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -916,6 +1000,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; + unsigned long nr_lumpy_taken = 0; + unsigned long nr_lumpy_dirty = 0; + unsigned long nr_lumpy_failed = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { @@ -978,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; /* * If we don't have enough swap space, reclaiming of @@ -986,19 +1073,37 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * pointless. */ if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - continue; + !PageSwapCache(cursor_page)) + break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); nr_taken++; + nr_lumpy_taken++; + if (PageDirty(cursor_page)) + nr_lumpy_dirty++; scan++; + } else { + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; + + trace_mm_vmscan_lru_isolate(order, + nr_to_scan, scan, + nr_taken, + nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, + mode); return nr_taken; } @@ -1035,7 +1140,8 @@ static unsigned long clear_active_flags(struct list_head *page_list, ClearPageActive(page); nr_active++; } - count[lru]++; + if (count) + count[lru]++; } return nr_active; @@ -1112,174 +1218,209 @@ static int too_many_isolated(struct zone *zone, int file, } /* - * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number - * of reclaimed pages + * TODO: Try merging with migrations version of putback_lru_pages */ -static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc, - int priority, int file) +static noinline_for_stack void +putback_lru_pages(struct zone *zone, struct scan_control *sc, + unsigned long nr_anon, unsigned long nr_file, + struct list_head *page_list) { - LIST_HEAD(page_list); + struct page *page; struct pagevec pvec; - unsigned long nr_scanned = 0; - unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + pagevec_init(&pvec, 1); - /* We are about to die and free our memory. Return now. */ - if (fatal_signal_pending(current)) - return SWAP_CLUSTER_MAX; + /* + * Put back any unfreeable pages. + */ + spin_lock(&zone->lru_lock); + while (!list_empty(page_list)) { + int lru; + page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); + list_del(&page->lru); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + reclaim_stat->recent_rotated[file]++; + } + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); +} +static noinline_for_stack void update_isolated_counts(struct zone *zone, + struct scan_control *sc, + unsigned long *nr_anon, + unsigned long *nr_file, + struct list_head *isolated_list) +{ + unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - pagevec_init(&pvec, 1); + nr_active = clear_active_flags(isolated_list, count); + __count_vm_events(PGDEACTIVATE, nr_active); - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - do { - struct page *page; - unsigned long nr_taken; - unsigned long nr_scan; - unsigned long nr_freed; - unsigned long nr_active; - unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE; - unsigned long nr_anon; - unsigned long nr_file; + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, - sc->order, mode, - zone, 0, file); - zone->pages_scanned += nr_scan; - if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scan); - else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scan); - } else { - nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, - sc->order, mode, - zone, sc->mem_cgroup, - 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. 
- */ - } + *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - if (nr_taken == 0) - goto done; + reclaim_stat->recent_scanned[0] += *nr_anon; + reclaim_stat->recent_scanned[1] += *nr_file; +} - nr_active = clear_active_flags(&page_list, count); - __count_vm_events(PGDEACTIVATE, nr_active); +/* + * Returns true if the caller should wait to clean dirty/writeback pages. + * + * If we are direct reclaiming for contiguous pages and we do not reclaim + * everything in the list, try again and wait for writeback IO to complete. + * This will stall high-order allocations noticeably. Only do that when really + * need to free the pages under high memory pressure. + */ +static inline bool should_reclaim_stall(unsigned long nr_taken, + unsigned long nr_freed, + int priority, + struct scan_control *sc) +{ + int lumpy_stall_priority; - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); + /* kswapd should not stall on sync IO */ + if (current_is_kswapd()) + return false; - nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); + /* Only stall on lumpy reclaim */ + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return false; - reclaim_stat->recent_scanned[0] += nr_anon; - reclaim_stat->recent_scanned[1] += nr_file; + /* If we have relaimed everything on the isolated list, no stall */ + if (nr_freed == nr_taken) + return false; - spin_unlock_irq(&zone->lru_lock); + /* + * For high-order allocations, there are two stall thresholds. + * High-cost allocations stall immediately where as lower + * order allocations such as stacks require the scanning + * priority to be much higher before stalling. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_stall_priority = DEF_PRIORITY; + else + lumpy_stall_priority = DEF_PRIORITY / 3; - nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + return priority <= lumpy_stall_priority; +} - /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller - */ - if (nr_freed < nr_taken && !current_is_kswapd() && - sc->lumpy_reclaim_mode) { - congestion_wait(BLK_RW_ASYNC, HZ/10); +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, + struct scan_control *sc, int priority, int file) +{ + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_anon; + unsigned long nr_file; - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. 
- */ - nr_active = clear_active_flags(&page_list, count); - count_vm_events(PGDEACTIVATE, nr_active); + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); - nr_freed += shrink_page_list(&page_list, sc, - PAGEOUT_IO_SYNC); - } + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } - nr_reclaimed += nr_freed; + set_lumpy_reclaim_mode(priority, sc, false); + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); - local_irq_disable(); + if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, + zone, 0, file); + zone->pages_scanned += nr_scanned; if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_freed); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - - spin_lock(&zone->lru_lock); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scanned); + } else { + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, + zone, sc->mem_cgroup, + 0, file); /* - * Put back any unfreeable pages. + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. */ - while (!list_empty(&page_list)) { - int lru; - page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { - spin_unlock_irq(&zone->lru_lock); - putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); - continue; - } - SetPageLRU(page); - lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; - } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } - } while (nr_scanned < max_scan); + if (nr_taken == 0) { + spin_unlock_irq(&zone->lru_lock); + return 0; + } + + update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); -done: spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); - return nr_reclaimed; -} -/* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. 
- */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; + nr_reclaimed = shrink_page_list(&page_list, zone, sc); + + /* Check if we should syncronously wait for writeback */ + if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); + } + + local_irq_disable(); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_STEAL, nr_reclaimed); + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); + + putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + return nr_reclaimed; } /* @@ -1426,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1451,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1583,6 +1739,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, } /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* * OK, so we have swap space and a fair amount of page cache * pages. We use the recently rotated / recently scanned * ratios to determine how valuable each cache is. @@ -1593,28 +1756,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; - spin_unlock_irq(&zone->lru_lock); } if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[1] /= 2; reclaim_stat->recent_rotated[1] /= 2; - spin_unlock_irq(&zone->lru_lock); } /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; - - /* * The amount of pressure on anon vs file pages is inversely * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. 
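The anon_prio/file_prio assignment moved above and the fp computation in the next hunk implement the balancing the comment describes: swappiness sets the static anon/file priorities, and the recent_scanned/recent_rotated ratios shift pressure away from whichever list is being actively reused. A userspace sketch of that arithmetic, assuming the anon side (ap) mirrors the visible fp formula; the reclaim statistics fed to it are invented for illustration:

/*
 * Model of the get_scan_count() anon/file split: higher rotation
 * (recent reuse) on a list lowers its share of the scan pressure.
 */
#include <stdio.h>

static void scan_balance(unsigned int swappiness,
                         unsigned long anon_scanned, unsigned long anon_rotated,
                         unsigned long file_scanned, unsigned long file_rotated)
{
        unsigned long anon_prio = swappiness;
        unsigned long file_prio = 200 - swappiness;
        unsigned long ap, fp;

        ap = (anon_prio + 1) * (anon_scanned + 1) / (anon_rotated + 1);
        fp = (file_prio + 1) * (file_scanned + 1) / (file_rotated + 1);

        printf("swappiness=%3u: anon %.0f%%, file %.0f%% of scan pressure\n",
               swappiness, 100.0 * ap / (ap + fp), 100.0 * fp / (ap + fp));
}

int main(void)
{
        /* File pages rotating heavily, anon pages mostly idle. */
        scan_balance(60, 1000, 100, 1000, 800);
        /* At swappiness 100 both lists start from the same priority. */
        scan_balance(100, 1000, 100, 1000, 800);
        return 0;
}

With the file list being referenced far more often than the anon list, most of the pressure ends up on the anon pages even at the default swappiness of 60.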
@@ -1624,6 +1777,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -1643,21 +1797,6 @@ out: } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; -} - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1672,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { @@ -1704,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1726,16 +1863,14 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static bool shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - sc->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1745,22 +1880,43 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. - */ - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); - all_unreclaimable = false; } +} + +static bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +/* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. It can't handle OOM during hibernation. + * So let's check zone's unreclaimable in direct reclaim as well as kswapd. 
+ */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + bool all_unreclaimable = true; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) { + all_unreclaimable = false; + break; + } + } + return all_unreclaimable; } @@ -1784,13 +1940,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - bool all_unreclaimable; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; get_mems_allowed(); @@ -1798,29 +1951,26 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); - /* - * mem_cgroup will not do shrink_slab. - */ - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - } - } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) disable_swap_token(); - all_unreclaimable = shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ if (scanning_global_lru(sc)) { + unsigned long lru_pages = 0; + for_each_zone_zonelist(zone, z, zonelist, + gfp_zone(sc->gfp_mask)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + lru_pages += zone_reclaimable_pages(zone); + } + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1846,32 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - -out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; - - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - zone->prev_priority = priority; + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + } +out: delayacct_freepages_end(); put_mems_allowed(); @@ -1879,7 +2013,7 @@ out: return sc->nr_reclaimed; /* top priority shrink_zones still had more to do? 
don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable) + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -1888,6 +2022,7 @@ out: unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { + unsigned long nr_reclaimed; struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, @@ -1900,7 +2035,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nodemask = nodemask, }; - return do_try_to_free_pages(zonelist, &sc); + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1908,9 +2051,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness, - struct zone *zone, int nid) + struct zone *zone) { struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, @@ -1918,13 +2062,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .order = 0, .mem_cgroup = mem, }; - nodemask_t nm = nodemask_of_node(nid); - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - sc.nodemask = &nm; - sc.nr_reclaimed = 0; - sc.nr_scanned = 0; + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -1933,6 +2077,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed; } @@ -1942,6 +2089,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, unsigned int swappiness) { struct zonelist *zonelist; + unsigned long nr_reclaimed; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1956,7 +2104,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; - return do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #endif @@ -2028,22 +2185,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .order = order, .mem_cgroup = NULL, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -2103,7 +2250,6 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; - int nid, zid; if (!populated_zone(zone)) continue; @@ -2111,18 +2257,14 @@ loop_again: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); - nid = pgdat->node_id; - zid = zone_idx(zone); /* * Call soft limit reclaim before calling shrink_zone. * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, - nid, zid); + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -2137,8 +2279,7 @@ loop_again: total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && - zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2160,6 +2301,15 @@ loop_again: if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); } } @@ -2186,16 +2336,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2299,9 +2439,10 @@ static int kswapd(void *p) * premature sleep. 
If not, then go fully * to sleep until explicitly woken up */ - if (!sleeping_prematurely(pgdat, order, remaining)) + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); schedule(); - else { + } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else @@ -2321,8 +2462,10 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) + if (!ret) { + trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); balance_pgdat(pgdat, order); + } } return 0; } @@ -2342,6 +2485,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) @@ -2590,9 +2734,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .swappiness = vm_swappiness, .order = order, }; - unsigned long slab_reclaimable; + unsigned long nr_slab_pages0, nr_slab_pages1; - disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -2611,14 +2754,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } - slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (slab_reclaimable > zone->min_slab_pages) { + nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages0 > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. So we take the current @@ -2629,17 +2771,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) > - slab_reclaimable - nr_pages) - ; + for (;;) { + unsigned long lru_pages = zone_reclaimable_pages(zone); + + /* No reclaimable slab or very low memory pressure */ + if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + break; + + /* Freed enough memory */ + nr_slab_pages1 = zone_page_state(zone, + NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) + break; + } /* * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - sc.nr_reclaimed += slab_reclaimable - - zone_page_state(zone, NR_SLAB_RECLAIMABLE); + nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 < nr_slab_pages0) + sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; } p->reclaim_state = NULL; @@ -2898,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. 
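The __zone_reclaim() rework above snapshots NR_SLAB_RECLAIMABLE into nr_slab_pages0 and then loops: each pass stops either when shrink_slab() reports no progress or when the slab count has dropped by at least the nr_pages the caller wanted, and only the pages the counter actually lost are credited to sc.nr_reclaimed. A userspace model of that termination and accounting logic; shrink_slab_model() and every count in it are invented for illustration:

/*
 * Model of the bounded slab-shaking loop: stop on no progress or once
 * the reclaimable-slab counter has fallen by the requested amount.
 */
#include <stdio.h>

static unsigned long slab_pages = 10000;        /* NR_SLAB_RECLAIMABLE stand-in */

/* Pretend each shrinker pass frees up to 300 slab pages. */
static unsigned long shrink_slab_model(void)
{
        unsigned long freed = slab_pages < 300 ? slab_pages : 300;

        slab_pages -= freed;
        return freed;
}

int main(void)
{
        unsigned long nr_pages = 1024;          /* reclaim target */
        unsigned long nr_slab_pages0 = slab_pages;
        unsigned long nr_slab_pages1;
        unsigned long reclaimed = 0;
        int passes = 0;

        for (;;) {
                passes++;
                if (!shrink_slab_model())       /* no reclaimable slab left */
                        break;
                nr_slab_pages1 = slab_pages;
                if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
                        break;                  /* freed enough */
        }

        nr_slab_pages1 = slab_pages;
        if (nr_slab_pages1 < nr_slab_pages0)
                reclaimed = nr_slab_pages0 - nr_slab_pages1;

        printf("%d passes, %lu slab pages credited as reclaimed\n",
               passes, reclaimed);
        return 0;
}

This replaces the old open-ended while loop, which kept calling shrink_slab() for as long as the counter stayed above slab_reclaimable - nr_pages.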
@@ -2944,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 7759941d4e77..cd2e42be7b68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,19 +17,21 @@ #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> +#include <linux/writeback.h> +#include <linux/compaction.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); -static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) +static void sum_vm_events(unsigned long *ret) { int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu(cpu, cpumask) { + for_each_online_cpu(cpu) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) @@ -45,7 +47,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) void all_vm_events(unsigned long *ret) { get_online_cpus(); - sum_vm_events(ret, cpu_online_mask); + sum_vm_events(ret); put_online_cpus(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -138,11 +140,24 @@ static void refresh_zone_stat_thresholds(void) int threshold; for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + threshold = calculate_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; } } @@ -381,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_COMPACTION + struct contig_page_info { unsigned long free_pages; unsigned long free_blocks_total; @@ -732,6 +748,11 @@ static const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "nr_shmem", + "nr_dirtied", + "nr_written", + "nr_dirty_threshold", + "nr_dirty_background_threshold", + #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -813,7 +834,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), + zone_nr_free_pages(zone), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -853,11 +874,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n all_unreclaimable: %u" - "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", zone->all_unreclaimable, - zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); @@ -893,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef 
CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) @@ -1000,6 +1027,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + refresh_zone_stat_thresholds(); start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); break; |
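The refresh_zone_stat_thresholds() hunk above only arms percpu_drift_mark when the worst-case per-cpu counter error (one stat threshold per online CPU) exceeds the gap between the low and min watermarks, i.e. when a stale NR_FREE_PAGES reading could report the zone above the low watermark while the min watermark has in fact been breached. A userspace sketch of that decision, with purely illustrative watermark and threshold values:

/*
 * Model of the percpu_drift_mark rule: arm the mark only when combined
 * per-cpu drift can hide a min-watermark breach.
 */
#include <stdio.h>

static unsigned long percpu_drift_mark(unsigned int online_cpus,
                                       unsigned long threshold,
                                       unsigned long min_wmark,
                                       unsigned long low_wmark,
                                       unsigned long high_wmark)
{
        unsigned long tolerate_drift = low_wmark - min_wmark;
        unsigned long max_drift = (unsigned long)online_cpus * threshold;

        if (max_drift > tolerate_drift)
                return high_wmark + max_drift;
        return 0;                               /* no mark needed */
}

int main(void)
{
        /* Small box: 2 CPUs cannot drift across the min..low gap. */
        printf("mark = %lu\n", percpu_drift_mark(2, 32, 1000, 1250, 1500));
        /* Large box: 64 CPUs x 125 pages of drift forces a safety mark. */
        printf("mark = %lu\n", percpu_drift_mark(64, 125, 1000, 1250, 1500));
        return 0;
}

The zoneinfo output above switching from zone_page_state(zone, NR_FREE_PAGES) to zone_nr_free_pages(zone) is presumably the consumer side of this mark, taking a more careful free-page reading once a zone is close to it.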