summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c85
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c23
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c41
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c8
-rw-r--r--mm/memcontrol.c141
-rw-r--r--mm/memory-failure.c21
-rw-r--r--mm/memory.c150
-rw-r--r--mm/memory_hotplug.c68
-rw-r--r--mm/mmap.c34
-rw-r--r--mm/nommu.c46
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c291
-rw-r--r--mm/page_alloc.c112
-rw-r--r--mm/page_cgroup.c10
-rw-r--r--mm/pagewalk.c49
-rw-r--r--mm/rmap.c16
-rw-r--r--mm/shmem.c626
-rw-r--r--mm/slab.c17
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c107
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swapfile.c31
-rw-r--r--mm/thrash.c17
-rw-r--r--mm/truncate.c163
-rw-r--r--mm/vmalloc.c18
-rw-r--r--mm/vmscan.c160
30 files changed, 1487 insertions, 772 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c8..f2f1ca19ed53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE
for clean pages that the kernel's pageframe replacement algorithm
(PFRA) would like to keep around, but can't since there isn't enough
memory. So when the PFRA "evicts" a page, it first attempts to use
- cleancacne code to put the data contained in that page into
+ cleancache code to put the data contained in that page into
"transcendent memory", memory that is not directly accessible or
addressable by the kernel and is of unknown and possibly
time-varying size. And when a cleancache-enabled
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
- "BdiWriteback: %8lu kB\n"
- "BdiReclaimable: %8lu kB\n"
- "BdiDirtyThresh: %8lu kB\n"
- "DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n"
- "b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
- "bdi_list: %8u\n"
- "state: %8lx\n",
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
return wb_has_dirty_io(&bdi->wb);
}
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- .nr_to_write = 1024,
- };
-
- writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
/*
* kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
- * the bdi from the thread.
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
*/
- bdi_flush_io(bdi);
+ writeback_inodes_wb(&bdi->wb, 1024);
} else {
/*
* The spinlock makes sure we do not lose
@@ -505,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -653,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
}
bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
err = prop_local_init_percpu(&bdi->completions);
if (err) {
@@ -676,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_wb_list_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..fbb58e346888 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
{
struct device *dev = pool->dev;
- dma_pool_destroy(pool);
WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+ dma_pool_destroy(pool);
}
EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,10 +78,7 @@
* ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
- * ->i_mutex
- * ->i_alloc_sem (various)
- *
- * inode_wb_list_lock
+ * bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
@@ -99,9 +96,9 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
@@ -131,6 +128,7 @@ void __delete_from_page_cache(struct page *page)
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
@@ -486,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
@@ -1795,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
{
@@ -1826,7 +1825,7 @@ repeat:
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
@@ -1866,7 +1865,7 @@ out:
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Same as read_cache_page, but don't wait for page to become unlocked
* after submitting it to the filler.
@@ -1878,7 +1877,7 @@ out:
*/
struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Read into the page cache. If a page already exists, and PageUptodate() is
* not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd2..e2d1587be269 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
list_del(&mm_slot->mm_node);
free = 1;
}
+ spin_unlock(&khugepaged_mm_lock);
if (free) {
- spin_unlock(&khugepaged_mm_lock);
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
- spin_unlock(&khugepaged_mm_lock);
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
- } else
- spin_unlock(&khugepaged_mm_lock);
+ }
}
static void release_pte_page(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc829..dae27ba3be2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
* must either hold the mmap_sem for write, or the mmap_sem for read and
* the hugetlb_instantiation mutex:
*
- * down_write(&mm->mmap_sem);
+ * down_write(&mm->mmap_sem);
* or
- * down_read(&mm->mmap_sem);
- * mutex_lock(&hugetlb_instantiation_mutex);
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
-
EXPORT_SYMBOL_GPL(PageHuge);
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
@@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
- if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
- }
}
@@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ if (non_swap_entry(swp) && is_migration_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2559,7 +2566,7 @@ retry:
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
+ ret = VM_FAULT_HWPOISON |
VM_FAULT_SET_HINDEX(h - hstates);
goto backout_unlocked;
}
@@ -2627,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
+ return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(h - hstates);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
endoff = (loff_t)(end - vma->vm_start - 1)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+ /* vmtruncate_range needs to take i_mutex */
up_read(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff);
down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad4..ccbf97339592 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
/* Check marker in the unused last array entry */
WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
memblock.memory_size = 0;
@@ -786,8 +786,8 @@ void __init memblock_init(void)
memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
/* Write a marker in the unused last array entry */
- memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
- memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
/* Create a dummy zero size MEMBLOCK which will get coalesced away later.
* This simplifies the memblock_add() code below...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf7d027a8844..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
- unsigned long next_scan_node_update;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
#endif
/*
* Should the accounting and control be hierarchical, per subtree?
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
return val;
}
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
- long ret;
-
- ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
- ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
- return ret;
-}
-
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
+ case MEM_CGROUP_TARGET_NUMAINFO:
+ next = val + NUMAINFO_EVENTS_TARGET;
+ break;
default:
return;
}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
mem_cgroup_threshold(mem);
__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
if (unlikely(__memcg_event_check(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT))){
+ MEM_CGROUP_TARGET_SOFTLIMIT))) {
mem_cgroup_update_tree(mem, page);
__mem_cgroup_target_update(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT);
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ }
+#if MAX_NUMNODES > 1
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_NUMAINFO))) {
+ atomic_inc(&mem->numainfo_events);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_NUMAINFO);
}
+#endif
}
}
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
return MEM_CGROUP_ZSTAT(mz, lru);
}
-#ifdef CONFIG_NUMA
static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
int nid)
{
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
return ret;
}
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+ int nid)
+{
+ unsigned long ret;
+
+ ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+ mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+ return ret;
+}
+
+#if MAX_NUMNODES > 1
static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
{
u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
return total;
}
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
- int nid)
-{
- unsigned long ret;
-
- ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
- mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
- return ret;
-}
-
static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
{
u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
return ret;
}
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+ int nid, bool noswap)
+{
+ if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+ return true;
+ if (noswap || !total_swap_pages)
+ return false;
+ if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+ return true;
+ return false;
+
+}
#if MAX_NUMNODES > 1
/*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
int nid;
-
- if (time_after(mem->next_scan_node_update, jiffies))
+ /*
+ * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+ * pagein/pageout changes since the last update.
+ */
+ if (!atomic_read(&mem->numainfo_events))
+ return;
+ if (atomic_inc_return(&mem->numainfo_updating) > 1)
return;
- mem->next_scan_node_update = jiffies + 10*HZ;
/* make a nodemask where this memcg uses memory from */
mem->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
- if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
- continue;
-
- if (total_swap_pages &&
- (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
- continue;
- node_clear(nid, mem->scan_nodes);
+ if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+ node_clear(nid, mem->scan_nodes);
}
+
+ atomic_set(&mem->numainfo_events, 0);
+ atomic_set(&mem->numainfo_updating, 0);
}
/*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
return node;
}
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ int nid;
+
+ /*
+ * quick check...making use of scan_node.
+ * We can skip unused nodes.
+ */
+ if (!nodes_empty(mem->scan_nodes)) {
+ for (nid = first_node(mem->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, mem->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ if (node_isset(nid, mem->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
return 0;
}
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
#endif
/*
@@ -1701,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
}
}
}
- if (!mem_cgroup_local_usage(victim)) {
+ if (!mem_cgroup_reclaimable(victim, noswap)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eac0ba561491..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -391,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct anon_vma *av;
- read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
@@ -408,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma(av);
}
/*
@@ -424,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
- read_lock(&tasklist_lock);
mutex_lock(&mapping->i_mmap_mutex);
+ read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -454,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- mutex_unlock(&mapping->i_mmap_mutex);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 87d935333f0d..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return 0;
+ batch = tlb->active;
}
VM_BUG_ON(batch->nr > batch->max);
@@ -1289,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
return addr;
}
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
-#endif
-
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
@@ -1309,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
*
* Unmap all pages in the vma list.
*
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
- *
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
@@ -1815,7 +1805,63 @@ next_page:
}
EXPORT_SYMBOL(__get_user_pages);
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/*
* get_user_pages() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -2798,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * If the underlying filesystem is not going to provide
- * a way to truncate a range of blocks (punch a hole) -
- * we should return failure right now.
- */
- if (!inode->i_op->truncate_range)
- return -ENOSYS;
-
- mutex_lock(&inode->i_mutex);
- down_write(&inode->i_alloc_sem);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- truncate_inode_pages_range(mapping, offset, end);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- inode->i_op->truncate_range(inode, offset, end);
- up_write(&inode->i_alloc_sem);
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3127,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
+ struct page *cow_page;
pte_t entry;
int anon = 0;
- int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
+ /*
+ * If we do COW later, allocate page befor taking lock_page()
+ * on the file cache page. This will reduce lock holding time.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!cow_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+ page_cache_release(cow_page);
+ return VM_FAULT_OOM;
+ }
+ } else
+ cow_page = NULL;
+
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
@@ -3143,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY)))
- return ret;
+ goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- return VM_FAULT_HWPOISON;
+ ret = VM_FAULT_HWPOISON;
+ goto uncharge_out;
}
/*
@@ -3166,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
+ page = cow_page;
anon = 1;
- if (unlikely(anon_vma_prepare(vma))) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
- ret = VM_FAULT_OOM;
- page_cache_release(page);
- goto out;
- }
- charged = 1;
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -3251,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, page_table);
} else {
- if (charged)
- mem_cgroup_uncharge_page(page);
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
if (anon)
page_cache_release(page);
else
@@ -3261,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
-out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
@@ -3291,6 +3318,13 @@ out:
unwritable_page:
page_cache_release(page);
return ret;
+uncharge_out:
+ /* fs's fault handler get error */
+ if (cow_page) {
+ mem_cgroup_uncharge_page(cow_page);
+ page_cache_release(cow_page);
+ }
+ return ret;
}
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11e..6e7d8b21dbfa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
#include "internal.h"
+/*
+ * online_page_callback contains pointer to current page onlining function.
+ * Initially it is generic_online_page(). If it is required it could be
+ * changed by calling set_online_page_callback() for callback registration
+ * and restore_online_page_callback() for generic callback restore.
+ */
+
+static void generic_online_page(struct page *page);
+
+static online_page_callback_t online_page_callback = generic_online_page;
+
DEFINE_MUTEX(mem_hotplug_mutex);
void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__remove_pages);
-void online_page(struct page *page)
+int set_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == generic_online_page) {
+ online_page_callback = callback;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(set_online_page_callback);
+
+int restore_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == callback) {
+ online_page_callback = generic_online_page;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(restore_online_page_callback);
+
+void __online_page_set_limits(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
- totalram_pages++;
if (pfn >= num_physpages)
num_physpages = pfn + 1;
+}
+EXPORT_SYMBOL_GPL(__online_page_set_limits);
+
+void __online_page_increment_counters(struct page *page)
+{
+ totalram_pages++;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages++;
#endif
+}
+EXPORT_SYMBOL_GPL(__online_page_increment_counters);
+void __online_page_free(struct page *page)
+{
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
}
+EXPORT_SYMBOL_GPL(__online_page_free);
+
+static void generic_online_page(struct page *page)
+{
+ __online_page_set_limits(page);
+ __online_page_increment_counters(page);
+ __online_page_free(page);
+}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
- online_page(page);
+ (*online_page_callback)(page);
onlined_pages++;
}
*(unsigned long *)arg = onlined_pages;
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8d..a65efd4db3e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..4358032566e9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/tracehook.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
- if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+ if ((flags & MAP_PRIVATE) && current->ptrace)
vm_flags &= ~VM_MAYSHARE;
return vm_flags;
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return NULL;
}
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
- unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
- vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1882,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -1896,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..eafff89b3dd6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
* then wait for it to finish before killing
* some other task unnecessarily.
*/
- if (!(task_ptrace(p->group_leader) &
- PT_TRACE_EXIT))
+ if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
return ERR_PTR(-1UL);
}
}
@@ -488,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If any of p's children has a different mm and is eligible for kill,
- * the one with the highest badness() score is sacrificed for its
+ * the one with the highest oom_badness() score is sacrificed for its
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..d1960744f881 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
#include <trace/events/writeback.h>
/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE max(HZ/5, 1)
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
+unsigned long global_dirty_limit;
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
*/
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
+ __inc_bdi_stat(bdi, BDI_WRITTEN);
__prop_inc_percpu_max(&vm_completions, &bdi->completions,
bdi->max_prop_frac);
}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- if (bdi_cap_writeback_dirty(bdi)) {
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
numerator, denominator);
- } else {
- *numerator = 0;
- *denominator = 1;
- }
}
static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
* effectively curb the growth of dirty pages. Light dirtiers with high enough
* dirty threshold may never get throttled.
*/
+#define TASK_LIMIT_FRACTION 8
static unsigned long task_dirty_limit(struct task_struct *tsk,
unsigned long bdi_dirty)
{
long numerator, denominator;
unsigned long dirty = bdi_dirty;
- u64 inv = dirty >> 3;
+ u64 inv = dirty / TASK_LIMIT_FRACTION;
task_dirties_fraction(tsk, &numerator, &denominator);
inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
return max(dirty, bdi_dirty/2);
}
+/* Minimum limit for any task */
+static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+{
+ return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+}
+
/*
*
*/
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
+static unsigned long hard_dirty_limit(unsigned long thresh)
+{
+ return max(thresh, global_dirty_limit);
+}
+
/*
* global_dirty_limits - background-writeback and dirty-throttling thresholds
*
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
*pbackground = background;
*pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
}
-/*
+/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
+ *
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
*
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
return bdi_dirty;
}
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+ unsigned long elapsed,
+ unsigned long written)
+{
+ const unsigned long period = roundup_pow_of_two(3 * HZ);
+ unsigned long avg = bdi->avg_write_bandwidth;
+ unsigned long old = bdi->write_bandwidth;
+ u64 bw;
+
+ /*
+ * bw = written * HZ / elapsed
+ *
+ * bw * elapsed + write_bandwidth * (period - elapsed)
+ * write_bandwidth = ---------------------------------------------------
+ * period
+ */
+ bw = written - bdi->written_stamp;
+ bw *= HZ;
+ if (unlikely(elapsed > period)) {
+ do_div(bw, elapsed);
+ avg = bw;
+ goto out;
+ }
+ bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw >>= ilog2(period);
+
+ /*
+ * one more level of smoothing, for filtering out sudden spikes
+ */
+ if (avg > old && old >= (unsigned long)bw)
+ avg -= (avg - old) >> 3;
+
+ if (avg < old && old <= (unsigned long)bw)
+ avg += (old - avg) >> 3;
+
+out:
+ bdi->write_bandwidth = bw;
+ bdi->avg_write_bandwidth = avg;
+}
+
+/*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ */
+static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+{
+ unsigned long limit = global_dirty_limit;
+
+ /*
+ * Follow up in one step.
+ */
+ if (limit < thresh) {
+ limit = thresh;
+ goto update;
+ }
+
+ /*
+ * Follow down slowly. Use the higher one as the target, because thresh
+ * may drop below dirty. This is exactly the reason to introduce
+ * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ */
+ thresh = max(thresh, dirty);
+ if (limit > thresh) {
+ limit -= (limit - thresh) >> 5;
+ goto update;
+ }
+ return;
+update:
+ global_dirty_limit = limit;
+}
+
+static void global_update_bandwidth(unsigned long thresh,
+ unsigned long dirty,
+ unsigned long now)
+{
+ static DEFINE_SPINLOCK(dirty_lock);
+ static unsigned long update_time;
+
+ /*
+ * check locklessly first to optimize away locking for the most time
+ */
+ if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ return;
+
+ spin_lock(&dirty_lock);
+ if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(thresh, dirty);
+ update_time = now;
+ }
+ spin_unlock(&dirty_lock);
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long written;
+
+ /*
+ * rate-limit, only update once every 200ms.
+ */
+ if (elapsed < BANDWIDTH_INTERVAL)
+ return;
+
+ written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+ /*
+ * Skip quiet periods when disk bandwidth is under-utilized.
+ * (at least 1s idle time between two flusher runs)
+ */
+ if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ goto snapshot;
+
+ if (thresh)
+ global_update_bandwidth(thresh, dirty, now);
+
+ bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+ bdi->written_stamp = written;
+ bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+ return;
+ spin_lock(&bdi->wb.list_lock);
+ __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+ start_time);
+ spin_unlock(&bdi->wb.list_lock);
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
static void balance_dirty_pages(struct address_space *mapping,
unsigned long write_chunk)
{
- long nr_reclaimable, bdi_nr_reclaimable;
- long nr_writeback, bdi_nr_writeback;
+ unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
+ unsigned long bdi_dirty;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ unsigned long task_bdi_thresh;
+ unsigned long min_task_bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
bool dirty_exceeded = false;
+ bool clear_dirty_exceeded = true;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ unsigned long start_time = jiffies;
for (;;) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = write_chunk,
- .range_cyclic = 1,
- };
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <=
- (background_thresh + dirty_thresh) / 2)
+ if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
break;
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- bdi_thresh = task_dirty_limit(current, bdi_thresh);
+ min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+ task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
} else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
}
/*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* bdi or process from holding back light ones; The latter is
* the last resort safeguard.
*/
- dirty_exceeded =
- (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
- || (nr_reclaimable + nr_writeback > dirty_thresh);
+ dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ (nr_dirty > dirty_thresh);
+ clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+ (nr_dirty <= dirty_thresh);
if (!dirty_exceeded)
break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
if (!bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
+ bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty, start_time);
+
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping,
* threshold otherwise wait until the disk writes catch
* up.
*/
- trace_wbc_balance_dirty_start(&wbc, bdi);
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- trace_wbc_balance_dirty_written(&wbc, bdi);
+ trace_balance_dirty_start(bdi);
+ if (bdi_nr_reclaimable > task_bdi_thresh) {
+ pages_written += writeback_inodes_wb(&bdi->wb,
+ write_chunk);
+ trace_balance_dirty_written(bdi, pages_written);
if (pages_written >= write_chunk)
break; /* We've done our duty */
}
- trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
+ trace_balance_dirty_wait(bdi);
+
+ dirty_thresh = hard_dirty_limit(dirty_thresh);
+ /*
+ * max-pause area. If dirty exceeded but still within this
+ * area, no need to sleep for more than 200ms: (a) 8 pages per
+ * 200ms is typically more than enough to curb heavy dirtiers;
+ * (b) the pause time limit makes the dirtiers more responsive.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+ time_after(jiffies, start_time + MAX_PAUSE))
+ break;
+ /*
+ * pass-good area. When some bdi gets blocked (eg. NFS server
+ * not responding), or write bandwidth dropped dramatically due
+ * to concurrent reads, or dirty threshold suddenly dropped and
+ * the dirty pages cannot be brought down anytime soon (eg. on
+ * slow USB stick), at least let go of the good bdi's.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_PASSGOOD_AREA &&
+ bdi_dirty < bdi_thresh)
+ break;
/*
* Increase the delay for each loop, up to our previous
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (!dirty_exceeded && bdi->dirty_exceeded)
+ /* Clear dirty_exceeded flag only when no task can exceed the limit */
+ if (clear_dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long ratelimit;
unsigned long *p;
+ if (!bdi_cap_account_dirty(bdi))
+ return;
+
ratelimit = ratelimit_pages;
if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
@@ -1141,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied);
void account_page_writeback(struct page *page)
{
inc_zone_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_WRITTEN);
}
EXPORT_SYMBOL(account_page_writeback);
@@ -1358,8 +1563,10 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
dec_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+ }
return ret;
}
@@ -1405,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- int ret;
- rcu_read_lock();
- ret = radix_tree_tagged(&mapping->page_tree, tag);
- rcu_read_unlock();
- return ret;
+ return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..094472377d81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
set_bit(i, zlc->fullzones);
}
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
#endif /* CONFIG_NUMA */
/*
@@ -1664,7 +1683,7 @@ zonelist_scan:
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
+ continue;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1695,36 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
+ if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+ * we do zlc_setup if there are multiple nodes
+ * and before considering the first zone allowed
+ * by the cpuset.
+ */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+
if (zone_reclaim_mode == 0)
goto this_zone_full;
+ /*
+ * As we may have just activated ZLC, check if the first
+ * eligible zone has failed zone_reclaim recently.
+ */
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
- goto try_next_zone;
+ continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
- goto this_zone_full;
+ continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1741,6 @@ try_this_zone:
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
-try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (unlikely(!(*did_some_progress)))
return NULL;
+ /* After successful reclaim, reconsider all zones for allocation */
+ if (NUMA_BUILD)
+ zlc_clear_zones_full(zonelist);
+
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
@@ -4585,6 +4617,60 @@ void __init sort_node_map(void)
cmp_node_active_region, NULL);
}
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ int last_nid = -1;
+ int i;
+
+ for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+ int nid = early_node_map[i].nid;
+ unsigned long start = early_node_map[i].start_pfn;
+ unsigned long end = early_node_map[i].end_pfn;
+ unsigned long mask;
+
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ /* accumulate all internode masks */
+ accl_mask |= mask;
+ }
+
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
+}
+
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
{
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293e..39d216d535ea 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
unsigned long start, end, pfn;
int fail = 0;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
if (nid == -1) {
/*
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
{
unsigned long start, end, pfn;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
nomem:
printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
printk(KERN_INFO
- "swap_cgroup can be disabled by noswapaccount boot option\n");
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d533611..2f5cf10ff660 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
-#endif
+
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma;
+
+ /* We don't need vma lookup at all. */
+ if (!walk->hugetlb_entry)
+ return NULL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ vma = find_vma(walk->mm, addr);
+ if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
+ return vma;
+
+ return NULL;
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ return NULL;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+
/**
* walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
* associated range, and a copy of the original mm_walk for access to
* the ->private or ->mm fields.
*
- * No locks are taken, but the bottom level iterator will map PTE
+ * Usually no locks are taken, but splitting transparent huge page may
+ * take page table lock. And the bottom level iterator will map PTE
* directories from highmem if necessary.
*
* If any callback returns a non-zero value, the walk is aborted and
* the return value is propagated back to the caller. Otherwise 0 is returned.
+ *
+ * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
+ * is !NULL.
*/
int walk_page_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *uninitialized_var(vma);
+ struct vm_area_struct *vma;
next = pgd_addr_end(addr, end);
-#ifdef CONFIG_HUGETLB_PAGE
/*
* handle hugetlb vma individually because pagetable walk for
* the hugetlb page is dependent on the architecture and
* we can't handled it in the same manner as non-huge pages.
*/
- vma = find_vma(walk->mm, addr);
- if (vma && is_vm_hugetlb_page(vma)) {
+ vma = hugetlb_vma(addr, walk);
+ if (vma) {
if (vma->vm_end < next)
next = vma->vm_end;
/*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, next);
continue;
}
-#endif
+
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
diff --git a/mm/rmap.c b/mm/rmap.c
index 27dfd3b82b0f..8005080fb9e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
@@ -32,15 +31,14 @@
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_wb_list_lock in __sync_single_inode)
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * (code doesn't rely on that order so it could be switched around)
- * ->tasklist_lock
- * anon_vma->mutex (memory_failure, collect_procs_anon)
+ * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
* pte map lock
*/
@@ -871,11 +869,11 @@ int page_referenced(struct page *page,
vm_flags);
if (we_locked)
unlock_page(page);
+
+ if (page_test_and_clear_young(page_to_pfn(page)))
+ referenced++;
}
out:
- if (page_test_and_clear_young(page_to_pfn(page)))
- referenced++;
-
return referenced;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b1..5cc21f8b4cd3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type);
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+
+static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, int *fault_type)
+{
+ return shmem_getpage_gfp(inode, index, pagep, sgp,
+ mapping_gfp_mask(inode->i_mapping), fault_type);
+}
static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
@@ -241,9 +249,7 @@ static void shmem_free_blocks(struct inode *inode, long pages)
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
percpu_counter_add(&sbinfo->used_blocks, -pages);
- spin_lock(&inode->i_lock);
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
}
}
@@ -405,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
* @info: info structure for the inode
* @index: index of the page to find
* @sgp: check and recheck i_size? skip allocation?
+ * @gfp: gfp mask to use for any page allocation
*
* If the entry does not exist, allocate it.
*/
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
+ unsigned long index, enum sgp_type sgp, gfp_t gfp)
{
struct inode *inode = &info->vfs_inode;
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -432,13 +440,11 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
sbinfo->max_blocks - 1) >= 0)
return ERR_PTR(-ENOSPC);
percpu_counter_inc(&sbinfo->used_blocks);
- spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ page = shmem_dir_alloc(gfp);
spin_lock(&info->lock);
if (!page) {
@@ -539,7 +545,7 @@ static void shmem_free_pages(struct list_head *next)
} while (next);
}
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long idx;
@@ -562,6 +568,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
spinlock_t *punch_lock;
unsigned long upper_limit;
+ truncate_inode_pages_range(inode->i_mapping, start, end);
+
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (idx >= info->next_index)
@@ -738,16 +746,8 @@ done2:
* lowered next_index. Also, though shmem_getpage checks
* i_size before adding to cache, no recheck after: so fix the
* narrow window there too.
- *
- * Recalling truncate_inode_pages_range and unmap_mapping_range
- * every time for punch_hole (which never got a chance to clear
- * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
- * yet hardly ever necessary: try to optimize them out later.
*/
truncate_inode_pages_range(inode->i_mapping, start, end);
- if (punch_hole)
- unmap_mapping_range(inode->i_mapping, start,
- end - start, 1);
}
spin_lock(&info->lock);
@@ -766,22 +766,23 @@ done2:
shmem_free_pages(pages_to_free.next);
}
}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- loff_t newsize = attr->ia_size;
int error;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
- && newsize != inode->i_size) {
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
struct page *page = NULL;
- if (newsize < inode->i_size) {
+ if (newsize < oldsize) {
/*
* If truncating down to a partial page, then
* if that page is already allocated, hold it
@@ -810,12 +811,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
spin_unlock(&info->lock);
}
}
-
- /* XXX(truncate): truncate_setsize should be called last */
- truncate_setsize(inode, newsize);
+ if (newsize != oldsize) {
+ i_size_write(inode, newsize);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ }
+ if (newsize < oldsize) {
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ }
if (page)
page_cache_release(page);
- shmem_truncate_range(inode, newsize, (loff_t)-1);
}
setattr_copy(inode, attr);
@@ -832,7 +840,6 @@ static void shmem_evict_inode(struct inode *inode)
struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
- truncate_inode_pages(inode->i_mapping, 0);
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -965,20 +972,7 @@ found:
error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
/* which does mem_cgroup_uncharge_cache_page on error */
- if (error == -EEXIST) {
- struct page *filepage = find_get_page(mapping, idx);
- error = 1;
- if (filepage) {
- /*
- * There might be a more uptodate page coming down
- * from a stacked writepage: forget our swappage if so.
- */
- if (PageUptodate(filepage))
- error = 0;
- page_cache_release(filepage);
- }
- }
- if (!error) {
+ if (error != -ENOMEM) {
delete_from_swap_cache(page);
set_page_dirty(page);
info->flags |= SHMEM_PAGEIN;
@@ -1065,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* shmem_backing_dev_info's capabilities prevent regular writeback or
* sync from ever calling shmem_writepage; but a stacking filesystem
- * may use the ->writepage of its underlying filesystem, in which case
+ * might use ->writepage of its underlying filesystem, in which case
* tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync. However, in those cases,
- * we do still want to check if there's a redundant swappage to be
- * discarded.
+ * and not for the writeback threads or sync.
*/
- if (wbc->for_reclaim)
- swap = get_swap_page();
- else
- swap.val = 0;
+ if (!wbc->for_reclaim) {
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ goto redirty;
+ }
+ swap = get_swap_page();
+ if (!swap.val)
+ goto redirty;
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
@@ -1085,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* we've taken the spinlock, because shmem_unuse_inode() will
* prune a !swapped inode from the swaplist under both locks.
*/
- if (swap.val) {
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
- }
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add_tail(&info->swaplist, &shmem_swaplist);
spin_lock(&info->lock);
- if (swap.val)
- mutex_unlock(&shmem_swaplist_mutex);
+ mutex_unlock(&shmem_swaplist_mutex);
if (index >= info->next_index) {
BUG_ON(!(info->flags & SHMEM_TRUNCATE));
@@ -1101,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
entry = shmem_swp_entry(info, index, NULL);
if (entry->val) {
- /*
- * The more uptodate page coming down from a stacked
- * writepage should replace our old swappage.
- */
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
free_swap_and_cache(*entry);
shmem_swp_set(info, entry, 0);
}
shmem_recalc_inode(inode);
- if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
delete_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);
@@ -1227,92 +1216,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
#endif
/*
- * shmem_getpage - either get the page from swap or allocate a new one
+ * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type)
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
- struct page *filepage = *pagep;
- struct page *swappage;
+ struct page *page;
struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
- gfp_t gfp;
int error;
+ int ret;
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
-
- if (type)
- *type = 0;
-
- /*
- * Normally, filepage is NULL on entry, and either found
- * uptodate immediately, or allocated and zeroed, or read
- * in under swappage, which is then assigned to filepage.
- * But shmem_readpage (required for splice) passes in a locked
- * filepage, which may be found not uptodate by other callers
- * too, and may need to be copied from the swappage read in.
- */
repeat:
- if (!filepage)
- filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
- gfp = mapping_gfp_mask(mapping);
- if (!filepage) {
+ page = find_lock_page(mapping, idx);
+ if (page) {
/*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu.
+ * Once we can get the page lock, it must be uptodate:
+ * if there were an error in reading back from swap,
+ * the page would not be inserted into the filecache.
*/
- error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
- if (error)
- goto failed;
- radix_tree_preload_end();
- if (sgp != SGP_READ && !prealloc_page) {
- /* We don't care if this fails */
- prealloc_page = shmem_alloc_page(gfp, info, idx);
- if (prealloc_page) {
- if (mem_cgroup_cache_charge(prealloc_page,
- current->mm, GFP_KERNEL)) {
- page_cache_release(prealloc_page);
- prealloc_page = NULL;
- }
+ BUG_ON(!PageUptodate(page));
+ goto done;
+ }
+
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu.
+ */
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (error)
+ goto out;
+ radix_tree_preload_end();
+
+ if (sgp != SGP_READ && !prealloc_page) {
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ SetPageSwapBacked(prealloc_page);
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
}
}
}
- error = 0;
spin_lock(&info->lock);
shmem_recalc_inode(inode);
- entry = shmem_swp_alloc(info, idx, sgp);
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry)) {
spin_unlock(&info->lock);
error = PTR_ERR(entry);
- goto failed;
+ goto out;
}
swap = *entry;
if (swap.val) {
/* Look it up and read it in.. */
- swappage = lookup_swap_cache(swap);
- if (!swappage) {
+ page = lookup_swap_cache(swap);
+ if (!page) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
/* here we actually do the io */
- if (type)
- *type |= VM_FAULT_MAJOR;
- swappage = shmem_swapin(swap, gfp, info, idx);
- if (!swappage) {
+ if (fault_type)
+ *fault_type |= VM_FAULT_MAJOR;
+ page = shmem_swapin(swap, gfp, info, idx);
+ if (!page) {
spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
else {
@@ -1322,62 +1302,42 @@ repeat:
}
spin_unlock(&info->lock);
if (error)
- goto failed;
+ goto out;
goto repeat;
}
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
/* We have to do this with page locked to prevent races */
- if (!trylock_page(swappage)) {
+ if (!trylock_page(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
- if (PageWriteback(swappage)) {
+ if (PageWriteback(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_writeback(swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- if (!PageUptodate(swappage)) {
+ if (!PageUptodate(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
error = -EIO;
- goto failed;
+ goto out;
}
- if (filepage) {
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- copy_highpage(filepage, swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
- set_page_dirty(filepage);
- swap_free(swap);
- } else if (!(error = add_to_page_cache_locked(swappage, mapping,
- idx, GFP_NOWAIT))) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- filepage = swappage;
- set_page_dirty(filepage);
- swap_free(swap);
- } else {
+ error = add_to_page_cache_locked(page, mapping,
+ idx, GFP_NOWAIT);
+ if (error) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
if (error == -ENOMEM) {
@@ -1386,32 +1346,38 @@ repeat:
* call memcg's OOM if needed.
*/
error = mem_cgroup_shmem_charge_fallback(
- swappage,
- current->mm,
- gfp);
+ page, current->mm, gfp);
if (error) {
- unlock_page(swappage);
- page_cache_release(swappage);
- goto failed;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
}
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- } else if (sgp == SGP_READ && !filepage) {
+
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, entry, 0);
shmem_swp_unmap(entry);
- filepage = find_get_page(mapping, idx);
- if (filepage &&
- (!PageUptodate(filepage) || !trylock_page(filepage))) {
+ delete_from_swap_cache(page);
+ spin_unlock(&info->lock);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ } else if (sgp == SGP_READ) {
+ shmem_swp_unmap(entry);
+ page = find_get_page(mapping, idx);
+ if (page && !trylock_page(page)) {
spin_unlock(&info->lock);
- wait_on_page_locked(filepage);
- page_cache_release(filepage);
- filepage = NULL;
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
spin_unlock(&info->lock);
- } else {
+
+ } else if (prealloc_page) {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
@@ -1420,126 +1386,86 @@ repeat:
shmem_acct_block(info->flags))
goto nospace;
percpu_counter_inc(&sbinfo->used_blocks);
- spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
} else if (shmem_acct_block(info->flags))
goto nospace;
- if (!filepage) {
- int ret;
-
- if (!prealloc_page) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
+ page = prealloc_page;
+ prealloc_page = NULL;
- /*
- * Precharge page while we can wait, compensate
- * after
- */
- error = mem_cgroup_cache_charge(filepage,
- current->mm, GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- goto failed;
- }
-
- spin_lock(&info->lock);
- } else {
- filepage = prealloc_page;
- prealloc_page = NULL;
- SetPageSwapBacked(filepage);
- }
-
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- ret = error || swap.val;
- if (ret)
- mem_cgroup_uncharge_cache_page(filepage);
- else
- ret = add_to_page_cache_lru(filepage, mapping,
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else {
+ swap = *entry;
+ shmem_swp_unmap(entry);
+ }
+ ret = error || swap.val;
+ if (ret)
+ mem_cgroup_uncharge_cache_page(page);
+ else
+ ret = add_to_page_cache_lru(page, mapping,
idx, GFP_NOWAIT);
- /*
- * At add_to_page_cache_lru() failure, uncharge will
- * be done automatically.
- */
- if (ret) {
- spin_unlock(&info->lock);
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- if (error)
- goto failed;
- goto repeat;
- }
- info->flags |= SHMEM_PAGEIN;
+ /*
+ * At add_to_page_cache_lru() failure,
+ * uncharge will be done automatically.
+ */
+ if (ret) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ spin_unlock(&info->lock);
+ page_cache_release(page);
+ if (error)
+ goto out;
+ goto repeat;
}
+ info->flags |= SHMEM_PAGEIN;
info->alloced++;
spin_unlock(&info->lock);
- clear_highpage(filepage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
if (sgp == SGP_DIRTY)
- set_page_dirty(filepage);
+ set_page_dirty(page);
+
+ } else {
+ spin_unlock(&info->lock);
+ error = -ENOMEM;
+ goto out;
}
done:
- *pagep = filepage;
+ *pagep = page;
error = 0;
- goto out;
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
+ return error;
nospace:
/*
* Perhaps the page was brought in from swap between find_lock_page
* and taking info->lock? We allow for that at add_to_page_cache_lru,
* but must also avoid reporting a spurious ENOSPC while working on a
- * full tmpfs. (When filepage has been passed in to shmem_getpage, it
- * is already in page cache, which prevents this race from occurring.)
+ * full tmpfs.
*/
- if (!filepage) {
- struct page *page = find_get_page(mapping, idx);
- if (page) {
- spin_unlock(&info->lock);
- page_cache_release(page);
- goto repeat;
- }
- }
+ page = find_get_page(mapping, idx);
spin_unlock(&info->lock);
- error = -ENOSPC;
-failed:
- if (*pagep != filepage) {
- unlock_page(filepage);
- page_cache_release(filepage);
- }
-out:
- if (prealloc_page) {
- mem_cgroup_uncharge_cache_page(prealloc_page);
- page_cache_release(prealloc_page);
+ if (page) {
+ page_cache_release(page);
+ goto repeat;
}
- return error;
+ error = -ENOSPC;
+ goto out;
}
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int error;
- int ret;
+ int ret = VM_FAULT_LOCKED;
if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
@@ -1547,11 +1473,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+
if (ret & VM_FAULT_MAJOR) {
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
}
- return ret | VM_FAULT_LOCKED;
+ return ret;
}
#ifdef CONFIG_NUMA
@@ -1668,19 +1595,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_symlink_inline_operations;
-/*
- * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
- * but providing them allows a tmpfs file to be used for splice, sendfile, and
- * below the loop driver, in the generic fashion that many filesystems support.
- */
-static int shmem_readpage(struct file *file, struct page *page)
-{
- struct inode *inode = page->mapping->host;
- int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
- unlock_page(page);
- return error;
-}
-
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1688,7 +1602,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = NULL;
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@@ -1845,6 +1758,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
return retval;
}
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct address_space *mapping = in->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned int loff, nr_pages, req_pages;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct page *page;
+ pgoff_t index, end_index;
+ loff_t isize, left;
+ int error, page_nr;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &page_cache_pipe_buf_ops,
+ .spd_release = spd_release_page,
+ };
+
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ return 0;
+
+ left = isize - *ppos;
+ if (unlikely(left < len))
+ len = left;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ loff = *ppos & ~PAGE_CACHE_MASK;
+ req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nr_pages = min(req_pages, pipe->buffers);
+
+ spd.nr_pages = find_get_pages_contig(mapping, index,
+ nr_pages, spd.pages);
+ index += spd.nr_pages;
+ error = 0;
+
+ while (spd.nr_pages < nr_pages) {
+ error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ spd.pages[spd.nr_pages++] = page;
+ index++;
+ }
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ nr_pages = spd.nr_pages;
+ spd.nr_pages = 0;
+
+ for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+ unsigned int this_len;
+
+ if (!len)
+ break;
+
+ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ page = spd.pages[page_nr];
+
+ if (!PageUptodate(page) || page->mapping != mapping) {
+ error = shmem_getpage(inode, index, &page,
+ SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
+ }
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index))
+ break;
+
+ if (end_index == index) {
+ unsigned int plen;
+
+ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (plen <= loff)
+ break;
+
+ this_len = min(this_len, plen - loff);
+ len = this_len;
+ }
+
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
+ len -= this_len;
+ loff = 0;
+ spd.nr_pages++;
+ index++;
+ }
+
+ while (page_nr < nr_pages)
+ page_cache_release(spd.pages[page_nr++]);
+
+ if (spd.nr_pages)
+ error = splice_to_pipe(pipe, &spd);
+
+ splice_shrink_spd(pipe, &spd);
+
+ if (error > 0) {
+ *ppos += error;
+ file_accessed(in);
+ }
+ return error;
+}
+
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2005,7 +2031,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
int error;
int len;
struct inode *inode;
- struct page *page = NULL;
+ struct page *page;
char *kaddr;
struct shmem_inode_info *info;
@@ -2683,7 +2709,6 @@ static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .readpage = shmem_readpage,
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
@@ -2700,13 +2725,13 @@ static const struct file_operations shmem_file_operations = {
.aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = generic_file_splice_write,
#endif
};
static const struct inode_operations shmem_inode_operations = {
- .setattr = shmem_notify_change,
+ .setattr = shmem_setattr,
.truncate_range = shmem_truncate_range,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
@@ -2714,10 +2739,6 @@ static const struct inode_operations shmem_inode_operations = {
.listxattr = shmem_listxattr,
.removexattr = shmem_removexattr,
#endif
-#ifdef CONFIG_TMPFS_POSIX_ACL
- .check_acl = generic_check_acl,
-#endif
-
};
static const struct inode_operations shmem_dir_inode_operations = {
@@ -2739,8 +2760,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.removexattr = shmem_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
- .check_acl = generic_check_acl,
+ .setattr = shmem_setattr,
#endif
};
@@ -2752,8 +2772,7 @@ static const struct inode_operations shmem_special_inode_operations = {
.removexattr = shmem_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
- .check_acl = generic_check_acl,
+ .setattr = shmem_setattr,
#endif
};
@@ -2908,6 +2927,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
return 0;
}
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+ truncate_inode_pages_range(inode->i_mapping, start, end);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/**
* mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -3028,3 +3053,42 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
+ * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+#ifdef CONFIG_SHMEM
+ struct inode *inode = mapping->host;
+ struct page *page;
+ int error;
+
+ BUG_ON(mapping->a_ops != &shmem_aops);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ if (error)
+ page = ERR_PTR(error);
+ else
+ unlock_page(page);
+ return page;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
+ return read_cache_page_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de775..1e523ed47c61 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache cache_cache = {
+ .nodelists = cache_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
- * struct kmem_cache size depends on nr_node_ids, which
- * can be less than MAX_NUMNODES.
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
- nr_node_ids * sizeof(struct kmem_list3 *);
+ cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_list3 *);
#if DEBUG
cache_cache.obj_size = cache_cache.buffer_size;
#endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (!cachep)
goto oops;
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
#if DEBUG
cachep->obj_size = size;
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
- if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+ if (ARCH_SLAB_MINALIGN &&
+ ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
- objp, ARCH_SLAB_MINALIGN);
+ objp, (int)ARCH_SLAB_MINALIGN);
}
-#endif
return objp;
}
#else
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..0ae881831ae2 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
+ gfp &= gfp_allowed_mask;
+
lockdep_trace_alloc(gfp);
if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
+ flags &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(flags);
+
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f26193..f8f5e8efeb88 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
#include <trace/events/kmem.h>
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches);
/*
* Tracking user of a slab.
*/
+#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
+#ifdef CONFIG_STACKTRACE
+ unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object,
struct track *p = get_track(s, object, alloc);
if (addr) {
+#ifdef CONFIG_STACKTRACE
+ struct stack_trace trace;
+ int i;
+
+ trace.nr_entries = 0;
+ trace.max_entries = TRACK_ADDRS_COUNT;
+ trace.entries = p->addrs;
+ trace.skip = 3;
+ save_stack_trace(&trace);
+
+ /* See rant in lockdep.c */
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+ p->addrs[i] = 0;
+#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t)
printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+ {
+ int i;
+ for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+ if (t->addrs[i])
+ printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+ else
+ break;
+ }
+#endif
}
static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
memset(p + s->objsize, val, s->inuse - s->objsize);
}
-static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
{
while (bytes) {
- if (*start != (u8)value)
+ if (*start != value)
return start;
start++;
bytes--;
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
return NULL;
}
+static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
+{
+ u64 value64;
+ unsigned int words, prefix;
+
+ if (bytes <= 16)
+ return check_bytes8(start, value, bytes);
+
+ value64 = value | value << 8 | value << 16 | value << 24;
+ value64 = value64 | value64 << 32;
+ prefix = 8 - ((unsigned long)start) % 8;
+
+ if (prefix) {
+ u8 *r = check_bytes8(start, value, prefix);
+ if (r)
+ return r;
+ start += prefix;
+ bytes -= prefix;
+ }
+
+ words = bytes / 8;
+
+ while (words) {
+ if (*(u64 *)start != value64)
+ return check_bytes8(start, value, 8);
+ start += 8;
+ words--;
+ }
+
+ return check_bytes8(start, value, bytes % 8);
+}
+
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
@@ -2928,6 +2993,42 @@ size_t ksize(const void *object)
}
EXPORT_SYMBOL(ksize);
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+ struct page *page;
+ void *object = (void *)x;
+ unsigned long flags;
+ bool rv;
+
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return false;
+
+ local_irq_save(flags);
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
+ /* maybe it was from stack? */
+ rv = true;
+ goto out_unlock;
+ }
+
+ slab_lock(page);
+ if (on_freelist(page->slab, page, object)) {
+ object_err(page->slab, page, object, "Object is on free-list");
+ rv = false;
+ } else {
+ rv = true;
+ }
+ slab_unlock(page);
+
+out_unlock:
+ local_irq_restore(flags);
+ return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
void kfree(const void *x)
{
struct page *page;
@@ -4058,7 +4159,7 @@ static int any_slab_objects(struct kmem_cache *s)
#endif
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
-#define to_slab(n) container_of(n, struct kmem_cache, kobj);
+#define to_slab(n) container_of(n, struct kmem_cache, kobj)
struct slab_attribute {
struct attribute attr;
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a2..858e1dff9b2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(struct page *page)
+int page_to_nid(const struct page *page)
{
return section_to_node_table[page_to_section(page)];
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7b..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
-#include <linux/shm.h>
+#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
@@ -1681,19 +1681,14 @@ out:
}
#ifdef CONFIG_PROC_FS
-struct proc_swaps {
- struct seq_file seq;
- int event;
-};
-
static unsigned swaps_poll(struct file *file, poll_table *wait)
{
- struct proc_swaps *s = file->private_data;
+ struct seq_file *seq = file->private_data;
poll_wait(file, &proc_poll_wait, wait);
- if (s->event != atomic_read(&proc_poll_event)) {
- s->event = atomic_read(&proc_poll_event);
+ if (seq->poll_event != atomic_read(&proc_poll_event)) {
+ seq->poll_event = atomic_read(&proc_poll_event);
return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
}
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- struct proc_swaps *s;
+ struct seq_file *seq;
int ret;
- s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- file->private_data = s;
-
ret = seq_open(file, &swaps_op);
- if (ret) {
- kfree(s);
+ if (ret)
return ret;
- }
- s->seq.private = s;
- s->event = atomic_read(&proc_poll_event);
- return ret;
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return 0;
}
static const struct file_operations proc_swaps_operations = {
diff --git a/mm/thrash.c b/mm/thrash.c
index fabf2d0f5169..e53f7d02c17c 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
* Released under the GPL, see the file COPYING for details.
*
* Simple token based thrashing protection, using the algorithm
- * described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
*
* Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
* Improved algorithm to pass token:
@@ -30,8 +30,6 @@
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
struct mem_cgroup *swap_token_memcg;
-static unsigned int global_faults;
-static unsigned int last_aging;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm)
{
int current_interval;
unsigned int old_prio = mm->token_priority;
+ static unsigned int global_faults;
+ static unsigned int last_aging;
global_faults++;
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm)
if (!swap_token_mm)
goto replace_token;
+ /*
+ * Usually, we don't need priority aging because long interval faults
+ * makes priority decrease quickly. But there is one exception. If the
+ * token owner task is sleeping, it never make long interval faults.
+ * Thus, we need a priority aging mechanism instead. The requirements
+ * of priority aging are
+ * 1) An aging interval is reasonable enough long. Too short aging
+ * interval makes quick swap token lost and decrease performance.
+ * 2) The swap token owner task have to get priority aging even if
+ * it's under sleep.
+ */
if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
swap_token_mm->token_priority /= 2;
last_aging = global_faults;
diff --git a/mm/truncate.c b/mm/truncate.c
index 3a29a6180212..232eb2736a79 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
- * When looking at page->index outside the page lock we need to be careful to
- * copy it into a local to avoid races (it could change at any time).
- *
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- pgoff_t end;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t index;
+ pgoff_t end;
int i;
cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
end = (lend >> PAGE_CACHE_SHIFT);
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ index = start;
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index = page->index;
- if (page_index > end) {
- next = page_index;
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
break;
- }
- if (page_index > next)
- next = page_index;
- next++;
if (!trylock_page(page))
continue;
+ WARN_ON(page->index != index);
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
}
- next = start;
+ index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- if (next == start)
+ if (!pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ if (index == start)
break;
- next = start;
+ index = start;
continue;
}
- if (pvec.pages[0]->index > end) {
+ if (index == start && pvec.pages[0]->index > end) {
pagevec_release(&pvec);
break;
}
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
break;
+
lock_page(page);
+ WARN_ON(page->index != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
- if (page->index > next)
- next = page->index;
- next++;
unlock_page(page);
}
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
+ index++;
}
cleancache_flush_inode(mapping);
}
@@ -304,6 +302,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
@@ -328,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
- pgoff_t next = start;
+ pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
pagevec_init(&pvec, 0);
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t index;
- int lock_failed;
-
- lock_failed = !trylock_page(page);
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
+ /* We rely upon deletion not changing page->index */
index = page->index;
- if (index > next)
- next = index;
- next++;
- if (lock_failed)
- continue;
+ if (index > end)
+ break;
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
ret = invalidate_inode_page(page);
unlock_page(page);
/*
@@ -366,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
if (!ret)
deactivate_page(page);
count += ret;
- if (next > end)
- break;
}
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
return count;
}
@@ -437,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t index;
int i;
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
- int wrapped = 0;
cleancache_flush_inode(mapping);
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end && !wrapped &&
- pagevec_lookup(&pvec, mapping, next,
- min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ index = start;
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index;
+
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
+ break;
lock_page(page);
+ WARN_ON(page->index != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
- page_index = page->index;
- next = page_index + 1;
- if (next == 0)
- wrapped = 1;
- if (page_index > end) {
- unlock_page(page);
- break;
- }
wait_on_page_writeback(page);
if (page_mapped(page)) {
if (!did_range_unmap) {
@@ -475,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- (loff_t)(end - page_index + 1)
- << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ (loff_t)(1 + end - index)
+ << PAGE_CACHE_SHIFT,
0);
did_range_unmap = 1;
} else {
@@ -485,8 +473,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Just zap this page
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
}
}
BUG_ON(page_mapped(page));
@@ -502,6 +490,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
cleancache_flush_inode(mapping);
return ret;
@@ -526,8 +515,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
/**
* truncate_pagecache - unmap and remove pagecache that has been truncated
* @inode: inode
- * @old: old file offset
- * @new: new file offset
+ * @oldsize: old file size
+ * @newsize: new file size
*
* inode's new i_size must already be written before truncate_pagecache
* is called.
@@ -539,9 +528,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
-void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
{
struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
/*
* unmap_mapping_range is called twice, first simply for
@@ -552,9 +542,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
* truncate_inode_pages finishes, hence the second
* unmap_mapping_range call must be made for correctness.
*/
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, new);
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+ truncate_inode_pages(mapping, newsize);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
@@ -584,22 +574,47 @@ EXPORT_SYMBOL(truncate_setsize);
/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
- * @offset: file offset to start truncating
+ * @newsize: file offset to start truncating
*
* This function is deprecated and truncate_setsize or truncate_pagecache
* should be used instead, together with filesystem specific block truncation.
*/
-int vmtruncate(struct inode *inode, loff_t offset)
+int vmtruncate(struct inode *inode, loff_t newsize)
{
int error;
- error = inode_newsize_ok(inode, offset);
+ error = inode_newsize_ok(inode, newsize);
if (error)
return error;
- truncate_setsize(inode, offset);
+ truncate_setsize(inode, newsize);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
}
EXPORT_SYMBOL(vmtruncate);
+
+int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(lstart, PAGE_SIZE);
+ loff_t holelen = 1 + lend - holebegin;
+
+ /*
+ * If the underlying filesystem is not going to provide
+ * a way to truncate a range of blocks (punch a hole) -
+ * we should return failure right now.
+ */
+ if (!inode->i_op->truncate_range)
+ return -ENOSYS;
+
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+ unmap_mapping_range(mapping, holebegin, holelen, 1);
+ inode->i_op->truncate_range(inode, lstart, lend);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(mapping, holebegin, holelen, 1);
+ mutex_unlock(&inode->i_mutex);
+
+ return 0;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..ab8494cde007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -452,13 +452,6 @@ overflow:
return ERR_PTR(-EBUSY);
}
-static void rcu_free_va(struct rcu_head *head)
-{
- struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
-
- kfree(va);
-}
-
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
- call_rcu(&va->rcu_head, rcu_free_va);
+ kfree_rcu(va, rcu_head);
}
/*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return vb;
}
-static void rcu_free_vb(struct rcu_head *head)
-{
- struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
- kfree(vb);
-}
-
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
- call_rcu(&vb->rcu_head, rcu_free_vb);
+ kfree_rcu(vb, rcu_head);
}
static void purge_fragmented_blocks(int cpu)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ff834e19c24..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+ total_scan = nr;
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
+ total_scan += delta;
+ if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
- shrinker->shrink, shrinker->nr);
- shrinker->nr = max_pass;
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}
/*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
- while (total_scan >= SHRINK_BATCH) {
- long this_scan = SHRINK_BATCH;
- int shrink_ret;
+ while (total_scan >= batch_size) {
int nr_before;
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
shrink_ret = do_shrinker_shrink(shrinker, shrink,
- this_scan);
+ batch_size);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, this_scan);
- total_scan -= this_scan;
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
cond_resched();
}
- shrinker->nr += total_scan;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
@@ -1995,14 +2036,13 @@ restart:
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
- unsigned long total_scanned = 0;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2017,19 +2057,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* need some check for avoid more shrink_zone() */
}
- nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- sc->order, sc->gfp_mask,
- &nr_soft_scanned);
- sc->nr_reclaimed += nr_soft_reclaimed;
- total_scanned += nr_soft_scanned;
-
shrink_zone(priority, zone, sc);
}
-
- return total_scanned;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2094,7 +2138,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- total_scanned += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2307,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
- return balanced_pages > (present_pages >> 2);
+ /* A special case here: if zone has no page, we think it's balanced */
+ return balanced_pages >= (present_pages >> 2);
}
/* is kswapd sleeping prematurely? */
@@ -2323,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
return true;
/* Check the watermark levels */
- for (i = 0; i < pgdat->nr_zones; i++) {
+ for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
@@ -2341,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
}
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- classzone_idx, 0))
+ i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2448,7 +2493,6 @@ loop_again:
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
- *classzone_idx = i;
break;
}
}
@@ -2507,18 +2551,18 @@ loop_again:
KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone) + balance_gap,
- end_zone, 0))
+ end_zone, 0)) {
shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 &&
- !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
+ }
+
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2528,6 +2572,12 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
+
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
@@ -2706,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
static int kswapd(void *p)
{
- unsigned long order;
- int classzone_idx;
+ unsigned long order, new_order;
+ int classzone_idx, new_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2737,17 +2787,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = 0;
- classzone_idx = MAX_NR_ZONES - 1;
+ order = new_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
for ( ; ; ) {
- unsigned long new_order;
- int new_classzone_idx;
int ret;
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon
+ * so consider going to sleep on the basis we reclaimed at
+ */
+ if (classzone_idx >= new_classzone_idx && order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
+
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
@@ -2760,7 +2816,7 @@ static int kswapd(void *p)
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();