diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/backing-dev.c | 259 | ||||
| -rw-r--r-- | mm/bounce.c | 75 | ||||
| -rw-r--r-- | mm/memcontrol.c | 36 | ||||
| -rw-r--r-- | mm/mmap.c | 11 | ||||
| -rw-r--r-- | mm/mmu_context.c | 3 | ||||
| -rw-r--r-- | mm/page_io.c | 2 | ||||
| -rw-r--r-- | mm/shmem.c | 1 | ||||
| -rw-r--r-- | mm/slab.c | 790 | ||||
| -rw-r--r-- | mm/slab.h | 43 | ||||
| -rw-r--r-- | mm/slab_common.c | 174 | ||||
| -rw-r--r-- | mm/slub.c | 221 | ||||
| -rw-r--r-- | mm/swap.c | 1 | ||||
| -rw-r--r-- | mm/vmalloc.c | 2 |
13 files changed, 679 insertions, 939 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41733c5dc820..502517492258 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); + +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { @@ -257,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. - */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. - */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. */ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); diff --git a/mm/bounce.c b/mm/bounce.c index a5c2ec3589cb..c9f0a4339a7d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) struct bio_vec *tovec, *fromvec; int i; - __bio_for_each_segment(tovec, to, i, 0) { + bio_for_each_segment(tovec, to, i) { fromvec = from->bi_io_vec + i; /* @@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; @@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool, int force) { - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); + struct bio *bio; + int rw = bio_data_dir(*bio_orig); struct bio_vec *to, *from; + unsigned i; - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; + bio_for_each_segment(from, *bio_orig, i) + if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + goto bounce; - /* - * is destination page below bounce pfn? - */ - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) - continue; - - /* - * irk, bounce it - */ - if (!bio) { - unsigned int cnt = (*bio_orig)->bi_vcnt; + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - bio = bio_alloc(GFP_NOIO, cnt); - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); - } - + bio_for_each_segment_all(to, bio, i) { + struct page *page = to->bv_page; - to = bio->bi_io_vec + i; + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); if (rw == WRITE) { char *vto, *vfrom; - flush_dcache_page(from->bv_page); + flush_dcache_page(page); + vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); + kunmap_atomic(vfrom); } } - /* - * no pages bounced - */ - if (!bio) - return; - trace_block_bio_bounce(q, *bio_orig); - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; if (pool == page_pool) { bio->bi_end_io = bounce_end_io_write; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f1d92163f30..cb1c9dedf9b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -92,16 +92,18 @@ enum mem_cgroup_stat_index { /* * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; static const char * const mem_cgroup_stat_names[] = { "cache", "rss", + "rss_huge", "mapped_file", "swap", }; @@ -917,6 +919,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, + struct page *page, bool anon, int nr_pages) { preempt_disable(); @@ -932,6 +935,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageTransHuge(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + nr_pages); + /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); @@ -2914,7 +2921,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, else anon = false; - mem_cgroup_charge_statistics(memcg, anon, nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); unlock_page_cgroup(pc); /* @@ -3708,16 +3715,21 @@ void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); struct page_cgroup *pc; + struct mem_cgroup *memcg; int i; if (mem_cgroup_disabled()) return; + + memcg = head_pc->mem_cgroup; for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; - pc->mem_cgroup = head_pc->mem_cgroup; + pc->mem_cgroup = memcg; smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3773,11 +3785,11 @@ static int mem_cgroup_move_account(struct page *page, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, anon, -nr_pages); + mem_cgroup_charge_statistics(from, page, anon, -nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, anon, nr_pages); + mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -4152,7 +4164,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, break; } - mem_cgroup_charge_statistics(memcg, anon, -nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); ClearPageCgroupUsed(pc); /* @@ -4502,7 +4514,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); + mem_cgroup_charge_statistics(memcg, oldpage, false, -1); ClearPageCgroupUsed(pc); } unlock_page_cgroup(pc); @@ -5030,6 +5042,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return res_counter_read_u64(&memcg->memsw, RES_USAGE); } + /* + * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS + * as well as in MEM_CGROUP_STAT_RSS_HUGE. + */ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); diff --git a/mm/mmap.c b/mm/mmap.c index da3e9c04bf37..f681e1842fad 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1363,15 +1363,24 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, file = fget(fd); if (!file) goto out; + if (is_file_hugepages(file)) + len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; + struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & + SHM_HUGE_MASK); + + if (!hs) + return -EINVAL; + + len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -14,9 +14,6 @@ * use_mm * Makes the calling kernel thread take on the specified * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. * (Note: this routine is intended to be called only * from a kernel thread context) */ diff --git a/mm/page_io.c b/mm/page_io.c index bb5d75274686..a8a3ef45fed7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> +#include <linux/aio.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -35,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } diff --git a/mm/shmem.c b/mm/shmem.c index 39b2a0b86fe8..5e6a8422658b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> +#include <linux/aio.h> static struct vfsmount *shm_mnt; diff --git a/mm/slab.c b/mm/slab.c index 96079244c860..8ccd296c6d9c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -286,68 +286,27 @@ struct arraycache_init { }; /* - * The slab lists for all objects. - */ -struct kmem_list3 { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -}; - -/* * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES -#define SIZE_L3 (2 * MAX_NUMNODES) +#define SIZE_NODE (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree); + struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); -/* - * This function must be completely optimized away if a constant is passed to - * it. Mostly the same as what is in linux/slab.h except it returns an index. - */ -static __always_inline int index_of(const size_t size) -{ - extern void __bad_size(void); - - if (__builtin_constant_p(size)) { - int i = 0; - -#define CACHE(x) \ - if (size <=x) \ - return i; \ - else \ - i++; -#include <linux/kmalloc_sizes.h> -#undef CACHE - __bad_size(); - } else - __bad_size(); - return 0; -} - static int slab_early_init = 1; -#define INDEX_AC index_of(sizeof(struct arraycache_init)) -#define INDEX_L3 index_of(sizeof(struct kmem_list3)) +#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) +#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) -static void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_cache_node_init(struct kmem_cache_node *parent) { INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); @@ -363,7 +322,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ + list_splice(&(cachep->node[nodeid]->slab), listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -524,30 +483,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } -/* - * These are the default caches for kmalloc. Custom caches can have other sizes. - */ -struct cache_sizes malloc_sizes[] = { -#define CACHE(x) { .cs_size = (x) }, -#include <linux/kmalloc_sizes.h> - CACHE(ULONG_MAX) -#undef CACHE -}; -EXPORT_SYMBOL(malloc_sizes); - -/* Must match cache_sizes above. Out of line to keep cache footprint low. */ -struct cache_names { - char *name; - char *name_dma; -}; - -static struct cache_names __initdata cache_names[] = { -#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, -#include <linux/kmalloc_sizes.h> - {NULL,} -#undef CACHE -}; - static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; @@ -586,15 +521,15 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, int q) { struct array_cache **alc; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int r; - l3 = cachep->nodelists[q]; - if (!l3) + n = cachep->node[q]; + if (!n) return; - lockdep_set_class(&l3->list_lock, l3_key); - alc = l3->alien; + lockdep_set_class(&n->list_lock, l3_key); + alc = n->alien; /* * FIXME: This check for BAD_ALIEN_MAGIC * should go away when common slab code is taught to @@ -625,28 +560,30 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) static void init_node_lock_keys(int q) { - struct cache_sizes *s = malloc_sizes; + int i; if (slab_state < UP) return; - for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { - struct kmem_list3 *l3; + for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { + struct kmem_cache_node *n; + struct kmem_cache *cache = kmalloc_caches[i]; + + if (!cache) + continue; - l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) + n = cache->node[q]; + if (!n || OFF_SLAB(cache)) continue; - slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + slab_set_lock_classes(cache, &on_slab_l3_key, &on_slab_alc_key, q); } } static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) { - struct kmem_list3 *l3; - l3 = cachep->nodelists[q]; - if (!l3) + if (!cachep->node[q]) return; slab_set_lock_classes(cachep, &on_slab_l3_key, @@ -702,41 +639,6 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, - gfp_t gfpflags) -{ - struct cache_sizes *csizep = malloc_sizes; - -#if DEBUG - /* This happens if someone tries to call - * kmem_cache_create(), or __kmalloc(), before - * the generic caches are initialized. - */ - BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); -#endif - if (!size) - return ZERO_SIZE_PTR; - - while (size > csizep->cs_size) - csizep++; - - /* - * Really subtle: The last entry with cs->cs_size==ULONG_MAX - * has cs_{dma,}cachep==NULL. Thus no special case - * for large kmalloc calls required. - */ -#ifdef CONFIG_ZONE_DMA - if (unlikely(gfpflags & GFP_DMA)) - return csizep->cs_dmacachep; -#endif - return csizep->cs_cachep; -} - -static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) -{ - return __find_general_cachep(size, gfpflags); -} - static size_t slab_mgmt_size(size_t nr_objs, size_t align) { return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); @@ -938,29 +840,29 @@ static inline bool is_slab_pfmemalloc(struct slab *slabp) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; + struct kmem_cache_node *n = cachep->node[numa_mem_id()]; struct slab *slabp; unsigned long flags; if (!pfmemalloc_active) return; - spin_lock_irqsave(&l3->list_lock, flags); - list_for_each_entry(slabp, &l3->slabs_full, list) + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(slabp, &n->slabs_full, list) if (is_slab_pfmemalloc(slabp)) goto out; - list_for_each_entry(slabp, &l3->slabs_partial, list) + list_for_each_entry(slabp, &n->slabs_partial, list) if (is_slab_pfmemalloc(slabp)) goto out; - list_for_each_entry(slabp, &l3->slabs_free, list) + list_for_each_entry(slabp, &n->slabs_free, list) if (is_slab_pfmemalloc(slabp)) goto out; pfmemalloc_active = false; out: - spin_unlock_irqrestore(&l3->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); } static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, @@ -971,7 +873,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_list3 *l3; + struct kmem_cache_node *n; if (gfp_pfmemalloc_allowed(flags)) { clear_obj_pfmemalloc(&objp); @@ -993,8 +895,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - l3 = cachep->nodelists[numa_mem_id()]; - if (!list_empty(&l3->slabs_free) && force_refill) { + n = cachep->node[numa_mem_id()]; + if (!list_empty(&n->slabs_free) && force_refill) { struct slab *slabp = virt_to_slab(objp); ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); clear_obj_pfmemalloc(&objp); @@ -1071,7 +973,7 @@ static int transfer_objects(struct array_cache *to, #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, n) do { } while (0) static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -1143,33 +1045,33 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_list3 *rl3 = cachep->nodelists[node]; + struct kmem_cache_node *n = cachep->node[node]; if (ac->avail) { - spin_lock(&rl3->list_lock); + spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. * That way we could avoid the overhead of putting the objects * into the free lists and getting them back later. */ - if (rl3->shared) - transfer_objects(rl3->shared, ac, ac->limit); + if (n->shared) + transfer_objects(n->shared, ac, ac->limit); free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; - spin_unlock(&rl3->list_lock); + spin_unlock(&n->list_lock); } } /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) { int node = __this_cpu_read(slab_reap_node); - if (l3->alien) { - struct array_cache *ac = l3->alien[node]; + if (n->alien) { + struct array_cache *ac = n->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node); @@ -1199,7 +1101,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1212,10 +1114,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(slabp->nodeid == node)) return 0; - l3 = cachep->nodelists[node]; + n = cachep->node[node]; STATS_INC_NODEFREES(cachep); - if (l3->alien && l3->alien[nodeid]) { - alien = l3->alien[nodeid]; + if (n->alien && n->alien[nodeid]) { + alien = n->alien[nodeid]; spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); @@ -1224,28 +1126,28 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { - spin_lock(&(cachep->nodelists[nodeid])->list_lock); + spin_lock(&(cachep->node[nodeid])->list_lock); free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + spin_unlock(&(cachep->node[nodeid])->list_lock); } return 1; } #endif /* - * Allocates and initializes nodelists for a node on each slab cache, used for - * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 + * Allocates and initializes node for a node on each slab cache, used for + * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodelists are not replaced if + * When hotplugging memory or a cpu, existing node are not replaced if * already in use. * * Must hold slab_mutex. */ -static int init_cache_nodelists_node(int node) +static int init_cache_node_node(int node) { struct kmem_cache *cachep; - struct kmem_list3 *l3; - const int memsize = sizeof(struct kmem_list3); + struct kmem_cache_node *n; + const int memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* @@ -1253,12 +1155,12 @@ static int init_cache_nodelists_node(int node) * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) + if (!cachep->node[node]) { + n = kmalloc_node(memsize, GFP_KERNEL, node); + if (!n) return -ENOMEM; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; /* @@ -1266,14 +1168,14 @@ static int init_cache_nodelists_node(int node) * go. slab_mutex is sufficient * protection here. */ - cachep->nodelists[node] = l3; + cachep->node[node] = n; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = + spin_lock_irq(&cachep->node[node]->list_lock); + cachep->node[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + spin_unlock_irq(&cachep->node[node]->list_lock); } return 0; } @@ -1281,7 +1183,7 @@ static int init_cache_nodelists_node(int node) static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); @@ -1293,34 +1195,34 @@ static void __cpuinit cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - l3 = cachep->nodelists[node]; + n = cachep->node[node]; - if (!l3) + if (!n) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - /* Free limit for this kmem_list3 */ - l3->free_limit -= cachep->batchcount; + /* Free limit for this kmem_cache_node */ + n->free_limit -= cachep->batchcount; if (nc) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); goto free_array_cache; } - shared = l3->shared; + shared = n->shared; if (shared) { free_block(cachep, shared->entry, shared->avail, node); - l3->shared = NULL; + n->shared = NULL; } - alien = l3->alien; - l3->alien = NULL; + alien = n->alien; + n->alien = NULL; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1336,17 +1238,17 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, n->free_objects); } } static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -1354,9 +1256,9 @@ static int __cpuinit cpuup_prepare(long cpu) * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right - * kmem_list3 and not this cpu's kmem_list3 + * kmem_cache_node and not this cpu's kmem_cache_node */ - err = init_cache_nodelists_node(node); + err = init_cache_node_node(node); if (err < 0) goto bad; @@ -1391,25 +1293,25 @@ static int __cpuinit cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; - BUG_ON(!l3); + n = cachep->node[node]; + BUG_ON(!n); - spin_lock_irq(&l3->list_lock); - if (!l3->shared) { + spin_lock_irq(&n->list_lock); + if (!n->shared) { /* * We are serialised from CPU_DEAD or * CPU_UP_CANCELLED by the cpucontrol lock */ - l3->shared = shared; + n->shared = shared; shared = NULL; } #ifdef CONFIG_NUMA - if (!l3->alien) { - l3->alien = alien; + if (!n->alien) { + n->alien = alien; alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) @@ -1464,9 +1366,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_DEAD_FROZEN: /* * Even if all the cpus of a node are down, we don't free the - * kmem_list3 of any cache. This to avoid a race between + * kmem_cache_node of any cache. This to avoid a race between * cpu_down, and a kmalloc allocation from another cpu for - * memory from the node of the cpu going down. The list3 + * memory from the node of the cpu going down. The node * structure is usually allocated from kmem_cache_create() and * gets destroyed at kmem_cache_destroy(). */ @@ -1494,22 +1396,22 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { * * Must hold slab_mutex. */ -static int __meminit drain_cache_nodelists_node(int node) +static int __meminit drain_cache_node_node(int node) { struct kmem_cache *cachep; int ret = 0; list_for_each_entry(cachep, &slab_caches, list) { - struct kmem_list3 *l3; + struct kmem_cache_node *n; - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, n->free_objects); - if (!list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial)) { + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) { ret = -EBUSY; break; } @@ -1531,12 +1433,12 @@ static int __meminit slab_memory_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: mutex_lock(&slab_mutex); - ret = init_cache_nodelists_node(nid); + ret = init_cache_node_node(nid); mutex_unlock(&slab_mutex); break; case MEM_GOING_OFFLINE: mutex_lock(&slab_mutex); - ret = drain_cache_nodelists_node(nid); + ret = drain_cache_node_node(nid); mutex_unlock(&slab_mutex); break; case MEM_ONLINE: @@ -1551,37 +1453,37 @@ out: #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ /* - * swap the static kmem_list3 with kmalloced memory + * swap the static kmem_cache_node with kmalloced memory */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, int nodeid) { - struct kmem_list3 *ptr; + struct kmem_cache_node *ptr; - ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); + ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); BUG_ON(!ptr); - memcpy(ptr, list, sizeof(struct kmem_list3)); + memcpy(ptr, list, sizeof(struct kmem_cache_node)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); - cachep->nodelists[nodeid] = ptr; + cachep->node[nodeid] = ptr; } /* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. + * For setting up all the kmem_cache_node for cache whose buffer_size is same as + * size of kmem_cache_node. */ -static void __init set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_node(struct kmem_cache *cachep, int index) { int node; for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + + cachep->node[node] = &init_kmem_cache_node[index + node]; + cachep->node[node]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; } @@ -1589,11 +1491,11 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) /* * The memory after the last cpu cache pointer is used for the - * the nodelists pointer. + * the node pointer. */ -static void setup_nodelists_pointer(struct kmem_cache *cachep) +static void setup_node_pointer(struct kmem_cache *cachep) { - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; } /* @@ -1602,20 +1504,18 @@ static void setup_nodelists_pointer(struct kmem_cache *cachep) */ void __init kmem_cache_init(void) { - struct cache_sizes *sizes; - struct cache_names *names; int i; kmem_cache = &kmem_cache_boot; - setup_nodelists_pointer(kmem_cache); + setup_node_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) - kmem_list3_init(&initkmem_list3[i]); + kmem_cache_node_init(&init_kmem_cache_node[i]); - set_up_list3s(kmem_cache, CACHE_CACHE); + set_up_node(kmem_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1631,7 +1531,7 @@ void __init kmem_cache_init(void) * kmem_cache structures of all caches, except kmem_cache itself: * kmem_cache is statically allocated. * Initially an __init data area is used for the head array and the - * kmem_list3 structures, it's replaced with a kmalloc allocated + * kmem_cache_node structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. * 2) Create the first kmalloc cache. * The struct kmem_cache for the new cache is allocated normally. @@ -1640,7 +1540,7 @@ void __init kmem_cache_init(void) * head arrays. * 4) Replace the __init data head arrays for kmem_cache and the first * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_list3 for kmem_cache and + * 5) Replace the __init data for kmem_cache_node for kmem_cache and * the other cache's with kmalloc allocated memory. * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ @@ -1652,50 +1552,28 @@ void __init kmem_cache_init(void) */ create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *), + nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN); list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ - sizes = malloc_sizes; - names = cache_names; /* * Initialize the caches that provide memory for the array cache and the - * kmem_list3 structures first. Without this, further allocations will + * kmem_cache_node structures first. Without this, further allocations will * bug. */ - sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", + kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); - if (INDEX_AC != INDEX_L3) - sizes[INDEX_L3].cs_cachep = - create_kmalloc_cache(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); + if (INDEX_AC != INDEX_NODE) + kmalloc_caches[INDEX_NODE] = + create_kmalloc_cache("kmalloc-node", + kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_early_init = 0; - while (sizes->cs_size != ULONG_MAX) { - /* - * For performance, all the general caches are L1 aligned. - * This should be particularly beneficial on SMP boxes, as it - * eliminates "false sharing". - * Note for systems short on memory removing the alignment will - * allow tighter packing of the smaller caches. - */ - if (!sizes->cs_cachep) - sizes->cs_cachep = create_kmalloc_cache(names->name, - sizes->cs_size, ARCH_KMALLOC_FLAGS); - -#ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = create_kmalloc_cache( - names->name_dma, sizes->cs_size, - SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); -#endif - sizes++; - names++; - } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; @@ -1713,36 +1591,35 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } - /* 5) Replace the bootstrap kmem_list3's */ + /* 5) Replace the bootstrap kmem_cache_node */ { int nid; for_each_online_node(nid) { - init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC + nid], nid); + init_list(kmalloc_caches[INDEX_AC], + &init_kmem_cache_node[SIZE_AC + nid], nid); - if (INDEX_AC != INDEX_L3) { - init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3 + nid], nid); + if (INDEX_AC != INDEX_NODE) { + init_list(kmalloc_caches[INDEX_NODE], + &init_kmem_cache_node[SIZE_NODE + nid], nid); } } } - slab_state = UP; + create_kmalloc_caches(ARCH_KMALLOC_FLAGS); } void __init kmem_cache_init_late(void) @@ -1773,7 +1650,7 @@ void __init kmem_cache_init_late(void) #ifdef CONFIG_NUMA /* * Register a memory hotplug callback that initializes and frees - * nodelists. + * node. */ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif @@ -1803,7 +1680,7 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct slab *slabp; unsigned long flags; int node; @@ -1818,24 +1695,24 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; - spin_lock_irqsave(&l3->list_lock, flags); - list_for_each_entry(slabp, &l3->slabs_full, list) { + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(slabp, &n->slabs_full, list) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_partial, list) { + list_for_each_entry(slabp, &n->slabs_partial, list) { active_objs += slabp->inuse; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_free, list) + list_for_each_entry(slabp, &n->slabs_free, list) num_slabs++; - free_objects += l3->free_objects; - spin_unlock_irqrestore(&l3->list_lock, flags); + free_objects += n->free_objects; + spin_unlock_irqrestore(&n->list_lock, flags); num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -2258,7 +2135,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) if (slab_state == DOWN) { /* * Note: Creation of first cache (kmem_cache). - * The setup_list3s is taken care + * The setup_node is taken care * of by the caller of __kmem_cache_create */ cachep->array[smp_processor_id()] = &initarray_generic.cache; @@ -2272,13 +2149,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) cachep->array[smp_processor_id()] = &initarray_generic.cache; /* - * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the second cache, then we need to set up all its list3s, + * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is + * the second cache, then we need to set up all its node/, * otherwise the creation of further caches will BUG(). */ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - slab_state = PARTIAL_L3; + set_up_node(cachep, SIZE_AC); + if (INDEX_AC == INDEX_NODE) + slab_state = PARTIAL_NODE; else slab_state = PARTIAL_ARRAYCACHE; } else { @@ -2287,20 +2164,20 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) kmalloc(sizeof(struct arraycache_init), gfp); if (slab_state == PARTIAL_ARRAYCACHE) { - set_up_list3s(cachep, SIZE_L3); - slab_state = PARTIAL_L3; + set_up_node(cachep, SIZE_NODE); + slab_state = PARTIAL_NODE; } else { int node; for_each_online_node(node) { - cachep->nodelists[node] = - kmalloc_node(sizeof(struct kmem_list3), + cachep->node[node] = + kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep->nodelists[node]); + BUG_ON(!cachep->node[node]); + kmem_cache_node_init(cachep->node[node]); } } } - cachep->nodelists[numa_mem_id()]->next_reap = + cachep->node[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2403,7 +2280,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - setup_nodelists_pointer(cachep); + setup_node_pointer(cachep); #if DEBUG /* @@ -2426,7 +2303,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= malloc_sizes[INDEX_L3 + 1].cs_size + if (size >= kmalloc_size(INDEX_NODE + 1) && cachep->object_size > cache_line_size() && ALIGN(size, cachep->align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); @@ -2497,7 +2374,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); + cachep->slabp_cache = kmalloc_slab(slab_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. * But since we go off slab only for object size greater than @@ -2543,7 +2420,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); + assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); #endif } @@ -2551,7 +2428,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[node]->list_lock); + assert_spin_locked(&cachep->node[node]->list_lock); #endif } @@ -2562,7 +2439,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node); @@ -2574,29 +2451,29 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->nodelists[node]->list_lock); + spin_lock(&cachep->node[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->nodelists[node]->list_lock); + spin_unlock(&cachep->node[node]->list_lock); ac->avail = 0; } static void drain_cpu_caches(struct kmem_cache *cachep) { - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node; on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (l3 && l3->alien) - drain_alien_cache(cachep, l3->alien); + n = cachep->node[node]; + if (n && n->alien) + drain_alien_cache(cachep, n->alien); } for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (l3) - drain_array(cachep, l3, l3->shared, 1, node); + n = cachep->node[node]; + if (n) + drain_array(cachep, n, n->shared, 1, node); } } @@ -2607,19 +2484,19 @@ static void drain_cpu_caches(struct kmem_cache *cachep) * Returns the actual number of slabs released. */ static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree) + struct kmem_cache_node *n, int tofree) { struct list_head *p; int nr_freed; struct slab *slabp; nr_freed = 0; - while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&l3->list_lock); - p = l3->slabs_free.prev; - if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { + spin_unlock_irq(&n->list_lock); goto out; } @@ -2632,8 +2509,8 @@ static int drain_freelist(struct kmem_cache *cache, * Safe to drop the lock. The slab is no longer linked * to the cache. */ - l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); + n->free_objects -= cache->num; + spin_unlock_irq(&n->list_lock); slab_destroy(cache, slabp); nr_freed++; } @@ -2645,20 +2522,20 @@ out: static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; - struct kmem_list3 *l3; + struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (!l3) + n = cachep->node[i]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, n->free_objects); - ret += !list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial); + ret += !list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial); } return (ret ? 1 : 0); } @@ -2687,7 +2564,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int rc = __cache_shrink(cachep); if (rc) @@ -2696,13 +2573,13 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) for_each_online_cpu(i) kfree(cachep->array[i]); - /* NUMA: free the list3 structures */ + /* NUMA: free the node structures */ for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); + n = cachep->node[i]; + if (n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); } } return 0; @@ -2884,7 +2761,7 @@ static int cache_grow(struct kmem_cache *cachep, struct slab *slabp; size_t offset; gfp_t local_flags; - struct kmem_list3 *l3; + struct kmem_cache_node *n; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2893,17 +2770,17 @@ static int cache_grow(struct kmem_cache *cachep, BUG_ON(flags & GFP_SLAB_BUG_MASK); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the l3 list lock to change the colour_next on this node */ + /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - l3 = cachep->nodelists[nodeid]; - spin_lock(&l3->list_lock); + n = cachep->node[nodeid]; + spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ - offset = l3->colour_next; - l3->colour_next++; - if (l3->colour_next >= cachep->colour) - l3->colour_next = 0; - spin_unlock(&l3->list_lock); + offset = n->colour_next; + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + spin_unlock(&n->list_lock); offset *= cachep->colour_off; @@ -2940,13 +2817,13 @@ static int cache_grow(struct kmem_cache *cachep, if (local_flags & __GFP_WAIT) local_irq_disable(); check_irq_off(); - spin_lock(&l3->list_lock); + spin_lock(&n->list_lock); /* Make slab active. */ - list_add_tail(&slabp->list, &(l3->slabs_free)); + list_add_tail(&slabp->list, &(n->slabs_free)); STATS_INC_GROWN(cachep); - l3->free_objects += cachep->num; - spin_unlock(&l3->list_lock); + n->free_objects += cachep->num; + spin_unlock(&n->list_lock); return 1; opps1: kmem_freepages(cachep, objp); @@ -3074,7 +2951,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct array_cache *ac; int node; @@ -3093,14 +2970,14 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + n = cachep->node[node]; - BUG_ON(ac->avail > 0 || !l3); - spin_lock(&l3->list_lock); + BUG_ON(ac->avail > 0 || !n); + spin_lock(&n->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { - l3->shared->touched = 1; + if (n->shared && transfer_objects(ac, n->shared, batchcount)) { + n->shared->touched = 1; goto alloc_done; } @@ -3108,11 +2985,11 @@ retry: struct list_head *entry; struct slab *slabp; /* Get slab alloc is to come from. */ - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) goto must_grow; } @@ -3140,15 +3017,15 @@ retry: /* move slabp to correct slabp list: */ list_del(&slabp->list); if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &l3->slabs_full); + list_add(&slabp->list, &n->slabs_full); else - list_add(&slabp->list, &l3->slabs_partial); + list_add(&slabp->list, &n->slabs_partial); } must_grow: - l3->free_objects -= ac->avail; + n->free_objects -= ac->avail; alloc_done: - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); if (unlikely(!ac->avail)) { int x; @@ -3315,7 +3192,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) /* * Fallback function if there was no memory available and no objects on a * certain node and fall back is permitted. First we scan all the - * available nodelists for available objects. If that fails then we + * available node for available objects. If that fails then we * perform an allocation without specifying a node. This allows the page * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. @@ -3349,8 +3226,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) { + cache->node[nid] && + cache->node[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3406,21 +3283,22 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct list_head *entry; struct slab *slabp; - struct kmem_list3 *l3; + struct kmem_cache_node *n; void *obj; int x; - l3 = cachep->nodelists[nodeid]; - BUG_ON(!l3); + VM_BUG_ON(nodeid > num_online_nodes()); + n = cachep->node[nodeid]; + BUG_ON(!n); retry: check_irq_off(); - spin_lock(&l3->list_lock); - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) + spin_lock(&n->list_lock); + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) goto must_grow; } @@ -3436,20 +3314,20 @@ retry: obj = slab_get_obj(cachep, slabp, nodeid); check_slabp(cachep, slabp); - l3->free_objects--; + n->free_objects--; /* move slabp to correct slabp list: */ list_del(&slabp->list); if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &l3->slabs_full); + list_add(&slabp->list, &n->slabs_full); else - list_add(&slabp->list, &l3->slabs_partial); + list_add(&slabp->list, &n->slabs_partial); - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); goto done; must_grow: - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); if (x) goto retry; @@ -3495,7 +3373,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->nodelists[nodeid])) { + if (unlikely(!cachep->node[nodeid])) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3601,7 +3479,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; - struct kmem_list3 *l3; + struct kmem_cache_node *n; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3611,19 +3489,19 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; slabp = virt_to_slab(objp); - l3 = cachep->nodelists[node]; + n = cachep->node[node]; list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); slab_put_obj(cachep, slabp, objp, node); STATS_DEC_ACTIVE(cachep); - l3->free_objects++; + n->free_objects++; check_slabp(cachep, slabp); /* fixup slab chains */ if (slabp->inuse == 0) { - if (l3->free_objects > l3->free_limit) { - l3->free_objects -= cachep->num; + if (n->free_objects > n->free_limit) { + n->free_objects -= cachep->num; /* No need to drop any previously held * lock here, even if we have a off-slab slab * descriptor it is guaranteed to come from @@ -3632,14 +3510,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, */ slab_destroy(cachep, slabp); } else { - list_add(&slabp->list, &l3->slabs_free); + list_add(&slabp->list, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&slabp->list, &l3->slabs_partial); + list_add_tail(&slabp->list, &n->slabs_partial); } } } @@ -3647,7 +3525,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node = numa_mem_id(); batchcount = ac->batchcount; @@ -3655,10 +3533,10 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - l3 = cachep->nodelists[node]; - spin_lock(&l3->list_lock); - if (l3->shared) { - struct array_cache *shared_array = l3->shared; + n = cachep->node[node]; + spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; if (max) { if (batchcount > max) @@ -3677,8 +3555,8 @@ free_done: int i = 0; struct list_head *p; - p = l3->slabs_free.next; - while (p != &(l3->slabs_free)) { + p = n->slabs_free.next; + while (p != &(n->slabs_free)) { struct slab *slabp; slabp = list_entry(p, struct slab, list); @@ -3690,7 +3568,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3800,7 +3678,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; - cachep = kmem_find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; return kmem_cache_alloc_node_trace(cachep, flags, node, size); @@ -3845,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * Then kmalloc uses the uninlined functions instead of the inline * functions. */ - cachep = __find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = slab_alloc(cachep, flags, caller); @@ -3934,12 +3812,12 @@ void kfree(const void *objp) EXPORT_SYMBOL(kfree); /* - * This initializes kmem_list3 or resizes various caches for all nodes. + * This initializes kmem_cache_node or resizes various caches for all nodes. */ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { int node; - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3962,43 +3840,43 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } } - l3 = cachep->nodelists[node]; - if (l3) { - struct array_cache *shared = l3->shared; + n = cachep->node[node]; + if (n) { + struct array_cache *shared = n->shared; - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, shared->avail, node); - l3->shared = new_shared; - if (!l3->alien) { - l3->alien = new_alien; + n->shared = new_shared; + if (!n->alien) { + n->alien = new_alien; new_alien = NULL; } - l3->free_limit = (1 + nr_cpus_node(node)) * + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(new_alien); continue; } - l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); - if (!l3) { + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) { free_alien_cache(new_alien); kfree(new_shared); goto fail; } - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - l3->shared = new_shared; - l3->alien = new_alien; - l3->free_limit = (1 + nr_cpus_node(node)) * + n->shared = new_shared; + n->alien = new_alien; + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - cachep->nodelists[node] = l3; + cachep->node[node] = n; } return 0; @@ -4007,13 +3885,13 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->nodelists[node]) { - l3 = cachep->nodelists[node]; + if (cachep->node[node]) { + n = cachep->node[node]; - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - cachep->nodelists[node] = NULL; + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[node] = NULL; } node--; } @@ -4073,9 +3951,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4176,11 +4054,11 @@ skip_setup: } /* - * Drain an array if it contains any elements taking the l3 lock only if - * necessary. Note that the l3 listlock also protects the array_cache + * Drain an array if it contains any elements taking the node lock only if + * necessary. Note that the node listlock also protects the array_cache * if drain_array() is used on the shared array. */ -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { int tofree; @@ -4190,7 +4068,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) @@ -4200,7 +4078,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); } } @@ -4219,7 +4097,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); @@ -4231,33 +4109,33 @@ static void cache_reap(struct work_struct *w) check_irq_on(); /* - * We only take the l3 lock if absolutely necessary and we + * We only take the node lock if absolutely necessary and we * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - l3 = searchp->nodelists[node]; + n = searchp->node[node]; - reap_alien(searchp, l3); + reap_alien(searchp, n); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), 0, node); /* * These are racy checks but it does not matter * if we skip one check or scan twice. */ - if (time_after(l3->next_reap, jiffies)) + if (time_after(n->next_reap, jiffies)) goto next; - l3->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + drain_array(searchp, n, n->shared, 0, node); - if (l3->free_touched) - l3->free_touched = 0; + if (n->free_touched) + n->free_touched = 0; else { int freed; - freed = drain_freelist(searchp, l3, (l3->free_limit + + freed = drain_freelist(searchp, n, (n->free_limit + 5 * searchp->num - 1) / (5 * searchp->num)); STATS_ADD_REAPED(searchp, freed); } @@ -4283,25 +4161,25 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) const char *name; char *error = NULL; int node; - struct kmem_list3 *l3; + struct kmem_cache_node *n; active_objs = 0; num_slabs = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &l3->slabs_full, list) { + list_for_each_entry(slabp, &n->slabs_full, list) { if (slabp->inuse != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_partial, list) { + list_for_each_entry(slabp, &n->slabs_partial, list) { if (slabp->inuse == cachep->num && !error) error = "slabs_partial inuse accounting error"; if (!slabp->inuse && !error) @@ -4309,16 +4187,16 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs += slabp->inuse; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_free, list) { + list_for_each_entry(slabp, &n->slabs_free, list) { if (slabp->inuse && !error) error = "slabs_free/inuse accounting error"; num_slabs++; } - free_objects += l3->free_objects; - if (l3->shared) - shared_avail += l3->shared->avail; + free_objects += n->free_objects; + if (n->shared) + shared_avail += n->shared->avail; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4344,7 +4222,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) { #if STATS - { /* list3 stats */ + { /* node stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; @@ -4497,9 +4375,9 @@ static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; - struct kmem_list3 *l3; + struct kmem_cache_node *n; const char *name; - unsigned long *n = m->private; + unsigned long *x = m->private; int node; int i; @@ -4510,43 +4388,43 @@ static int leaks_show(struct seq_file *m, void *p) /* OK, we can do it */ - n[1] = 0; + x[1] = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &l3->slabs_full, list) - handle_slab(n, cachep, slabp); - list_for_each_entry(slabp, &l3->slabs_partial, list) - handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + list_for_each_entry(slabp, &n->slabs_full, list) + handle_slab(x, cachep, slabp); + list_for_each_entry(slabp, &n->slabs_partial, list) + handle_slab(x, cachep, slabp); + spin_unlock_irq(&n->list_lock); } name = cachep->name; - if (n[0] == n[1]) { + if (x[0] == x[1]) { /* Increase the buffer size */ mutex_unlock(&slab_mutex); - m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); if (!m->private) { /* Too bad, we are really out */ - m->private = n; + m->private = x; mutex_lock(&slab_mutex); return -ENOMEM; } - *(unsigned long *)m->private = n[0] * 2; - kfree(n); + *(unsigned long *)m->private = x[0] * 2; + kfree(x); mutex_lock(&slab_mutex); /* Now make sure this entry will be retried */ m->count = m->size; return 0; } - for (i = 0; i < n[1]; i++) { - seq_printf(m, "%s: %lu ", name, n[2*i+3]); - show_symbol(m, n[2*i+2]); + for (i = 0; i < x[1]; i++) { + seq_printf(m, "%s: %lu ", name, x[2*i+3]); + show_symbol(m, x[2*i+2]); seq_putc(m, '\n'); } diff --git a/mm/slab.h b/mm/slab.h index 34a98d642196..f96b49e4704e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -16,7 +16,7 @@ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ - PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; @@ -35,6 +35,15 @@ extern struct kmem_cache *kmem_cache; unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size); +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); @@ -230,3 +239,35 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } #endif + + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; diff --git a/mm/slab_common.c b/mm/slab_common.c index 3f3cd97d3fdf..ff3218a0f5e1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -299,7 +299,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz err = __kmem_cache_create(s, flags); if (err) - panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n", + panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", name, size, err); s->refcount = -1; /* Exempt from merging for now */ @@ -319,6 +319,178 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_caches); + +#ifdef CONFIG_ZONE_DMA +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_dma_caches); +#endif + +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. + */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + +/* + * Find the kmem_cache structure that serves a given size of + * allocation + */ +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) +{ + int index; + + if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE)) + return NULL; + + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[size_index_elem(size)]; + } else + index = fls(size - 1); + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & GFP_DMA))) + return kmalloc_dma_caches[index]; + +#endif + return kmalloc_caches[index]; +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } + + if (KMALLOC_MIN_SIZE >= 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + + } + + if (KMALLOC_MIN_SIZE >= 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[size_index_elem(i)] = 8; + } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[i]) { + kmalloc_caches[i] = create_kmalloc_cache(NULL, + 1 << i, flags); + } + + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); + + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + } + + /* Kmalloc array is now usable */ + slab_state = UP; + + for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + char *n; + + if (s) { + n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); + + BUG_ON(!n); + s->name = n; + } + } + +#ifdef CONFIG_ZONE_DMA + for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s) { + int size = kmalloc_size(i); + char *n = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", size); + + BUG_ON(!n); + kmalloc_dma_caches[i] = create_kmalloc_cache(n, + size, SLAB_CACHE_DMA | flags); + } + } +#endif +} #endif /* !CONFIG_SLOB */ diff --git a/mm/slub.c b/mm/slub.c index a0206df88aba..57707f01bcfb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1006,7 +1006,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (n) { + if (likely(n)) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -1494,7 +1494,7 @@ static inline void remove_partial(struct kmem_cache_node *n, */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, - int mode) + int mode, int *objects) { void *freelist; unsigned long counters; @@ -1508,6 +1508,7 @@ static inline void *acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; + *objects = new.objects - new.inuse; if (mode) { new.inuse = page->objects; new.freelist = NULL; @@ -1529,7 +1530,7 @@ static inline void *acquire_slab(struct kmem_cache *s, return freelist; } -static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* @@ -1540,6 +1541,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; + int available = 0; + int objects; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1553,22 +1556,21 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t; - int available; if (!pfmemalloc_match(page, flags)) continue; - t = acquire_slab(s, n, page, object == NULL); + t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) break; + available += objects; if (!object) { c->page = page; stat(s, ALLOC_FROM_PARTIAL); object = t; - available = page->objects - page->inuse; } else { - available = put_cpu_partial(s, page, 0); + put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } if (kmem_cache_debug(s) || available > s->cpu_partial / 2) @@ -1947,7 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s, * If we did not find a slot then simply move all the partials to the * per node partial list. */ -static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { struct page *oldpage; int pages; @@ -1985,7 +1987,6 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->next = oldpage; } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - return pobjects; } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2042,7 +2043,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && page_to_nid(page) != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; @@ -2332,13 +2333,18 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = memcg_kmem_get_cache(s, gfpflags); redo: - /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is * enabled. We may switch back and forth between cpus while * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. + * + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); /* @@ -2348,7 +2354,7 @@ redo: * linked list in between. */ tid = c->tid; - barrier(); + preempt_enable(); object = c->freelist; page = c->page; @@ -2595,10 +2601,11 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); tid = c->tid; - barrier(); + preempt_enable(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); @@ -2776,7 +2783,7 @@ init_kmem_cache_node(struct kmem_cache_node *n) static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -2983,7 +2990,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) - s->allocflags |= SLUB_DMA; + s->allocflags |= GFP_DMA; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -3175,13 +3182,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s) * Kmalloc subsystem *******************************************************************/ -struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; -EXPORT_SYMBOL(kmalloc_caches); - -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -3218,73 +3218,15 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -/* - * Conversion table for small slabs sizes / 8 to the index in the - * kmalloc array. This is necessary for slabs < 192 since we have non power - * of two cache sizes there. The size of larger slabs can be determined using - * fls. - */ -static s8 size_index[24] = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ - 5, /* 32 */ - 6, /* 40 */ - 6, /* 48 */ - 6, /* 56 */ - 6, /* 64 */ - 1, /* 72 */ - 1, /* 80 */ - 1, /* 88 */ - 1, /* 96 */ - 7, /* 104 */ - 7, /* 112 */ - 7, /* 120 */ - 7, /* 128 */ - 2, /* 136 */ - 2, /* 144 */ - 2, /* 152 */ - 2, /* 160 */ - 2, /* 168 */ - 2, /* 176 */ - 2, /* 184 */ - 2 /* 192 */ -}; - -static inline int size_index_elem(size_t bytes) -{ - return (bytes - 1) / 8; -} - -static struct kmem_cache *get_slab(size_t size, gfp_t flags) -{ - int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else - index = fls(size - 1); - -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; -} - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3317,7 +3259,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); trace_kmalloc_node(_RET_IP_, ret, @@ -3327,7 +3269,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3620,6 +3562,12 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) memcpy(s, static_cache, kmem_cache->object_size); + /* + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. + */ + __flush_cpu_slab(s, smp_processor_id()); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); struct page *p; @@ -3642,8 +3590,6 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; - int i; - int caches = 2; if (debug_guardpage_minorder()) slub_max_order = 0; @@ -3674,103 +3620,16 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ - - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) - break; - size_index[elem] = KMALLOC_SHIFT_LOW; - } - - if (KMALLOC_MIN_SIZE == 64) { - /* - * The 96 byte size cache is not used if the alignment - * is 64 byte. - */ - for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; - } else if (KMALLOC_MIN_SIZE == 128) { - /* - * The 192 byte sized cache is not used if the alignment - * is 128 byte. Redirect kmalloc to use the 256 byte cache - * instead. - */ - for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; - } - - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); - caches++; - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); - caches++; - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); - caches++; - } - - slab_state = UP; - - /* Provide the correct kmalloc names now that the caches are up */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[1]->name); - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[2]->name); - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); - - BUG_ON(!s); - kmalloc_caches[i]->name = s; - } + create_kmalloc_caches(0); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_ZONE_DMA - for (i = 0; i < SLUB_PAGE_SHIFT; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - - if (s && s->size) { - char *name = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", s->object_size); - - BUG_ON(!name); - kmalloc_dma_caches[i] = create_kmalloc_cache(name, - s->object_size, SLAB_CACHE_DMA); - } - } -#endif printk(KERN_INFO - "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", - caches, cache_line_size(), + cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } @@ -3933,10 +3792,10 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3956,7 +3815,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, gfpflags, node); trace_kmalloc_node(caller, ret, @@ -3966,7 +3825,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -4315,7 +4174,7 @@ static void resiliency_test(void) { u8 *p; - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); printk(KERN_ERR "SLUB resiliency testing\n"); printk(KERN_ERR "-----------------------\n"); diff --git a/mm/swap.c b/mm/swap.c index acd40bfffa82..dfd7d71d6841 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> +#include <linux/uio.h> #include "internal.h" diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b12fd8612604..d365724feb05 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1522,6 +1522,8 @@ static void __vunmap(const void *addr, int deallocate_pages) * Must not be called in NMI context (strictly speaking, only if we don't * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) + * + * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) * */ void vfree(const void *addr) |
