From 08604bd9935dc98fb62ef61d5b7baa7ccc10f8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:31:12 -0700 Subject: time: move PIT_TICK_RATE to linux/timex.h PIT_TICK_RATE is currently defined in four architectures, but in three different places. While linux/timex.h is not the perfect place for it, it is still a reasonable replacement for those drivers that traditionally use asm/timex.h to get CLOCK_TICK_RATE and expect it to be the PIT frequency. Note that for Alpha, the actual value changed from 1193182UL to 1193180UL. This is unlikely to make a difference, and probably can only improve accuracy. There was a discussion on the correct value of CLOCK_TICK_RATE a few years ago, after which every existing instance was getting changed to 1193182. According to the specification, it should be 1193181.818181... Signed-off-by: Arnd Bergmann Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: john stultz Cc: Dmitry Torokhov Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/timex.h b/include/linux/timex.h index 9910e3bd5b31..e6967d10d9e5 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -280,6 +280,9 @@ extern int do_adjtimex(struct timex *); int read_current_timer(unsigned long *timer_val); +/* The clock frequency of the i8253/i8254 PIT */ +#define PIT_TICK_RATE 1193182ul + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ -- cgit v1.2.3 From 3b0fde0fac19c180317eb0601b3504083f4b9bf5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:31:16 -0700 Subject: firmware_map: fix hang with x86/32bit Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13484 Peer reported: | The bug is introduced from kernel 2.6.27, if E820 table reserve the memory | above 4G in 32bit OS(BIOS-e820: 00000000fff80000 - 0000000120000000 | (reserved)), system will report Int 6 error and hang up. The bug is caused by | the following code in drivers/firmware/memmap.c, the resource_size_t is 32bit | variable in 32bit OS, the BUG_ON() will be invoked to result in the Int 6 | error. I try the latest 32bit Ubuntu and Fedora distributions, all hit this | bug. |====== |static int firmware_map_add_entry(resource_size_t start, resource_size_t end, | const char *type, | struct firmware_map_entry *entry) and it only happen with CONFIG_PHYS_ADDR_T_64BIT is not set. it turns out we need to pass u64 instead of resource_size_t for that. [akpm@linux-foundation.org: add comment] Reported-and-tested-by: Peer Chen Signed-off-by: Yinghai Lu Cc: Ingo Molnar Acked-by: H. Peter Anvin Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/firmware-map.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index cca686b39123..875451f1373a 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -24,21 +24,17 @@ */ #ifdef CONFIG_FIRMWARE_MEMMAP -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type); -int firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type); +int firmware_map_add(u64 start, u64 end, const char *type); +int firmware_map_add_early(u64 start, u64 end, const char *type); #else /* CONFIG_FIRMWARE_MEMMAP */ -static inline int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +static inline int firmware_map_add(u64 start, u64 end, const char *type) { return 0; } -static inline int firmware_map_add_early(resource_size_t start, - resource_size_t end, const char *type) +static inline int firmware_map_add_early(u64 start, u64 end, const char *type) { return 0; } -- cgit v1.2.3 From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it's already wants get_vm_area() export. Somebody should look there. [akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 28b1f30601b5..5368fbdc7801 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,18 +15,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#define INIT_MM(name) \ -{ \ - .mm_rb = RB_ROOT, \ - .pgd = swapper_pg_dir, \ - .mm_users = ATOMIC_INIT(2), \ - .mm_count = ATOMIC_INIT(1), \ - .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ - .page_table_lock = __SPIN_LOCK_UNLOCKED(name.page_table_lock), \ - .mmlist = LIST_HEAD_INIT(name.mmlist), \ - .cpu_vm_mask = CPU_MASK_ALL, \ -} - #define INIT_SIGNALS(sig) { \ .count = ATOMIC_INIT(1), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ -- cgit v1.2.3 From 1ebf26a9b338534def47f307c6c8694b6dfc0a79 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:19 -0700 Subject: readahead: make mmap_miss an unsigned int This makes the performance impact of possible mmap_miss wrap around to be temporary and tolerable: i.e. MMAP_LOTSAMISS=100 extra readarounds. Otherwise if ever mmap_miss wraps around to negative, it takes INT_MAX cache misses to bring it back to normal state. During the time mmap readaround will be _enabled_ for whatever wild random workload. That's almost permanent performance impact. Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa7da5d..8146e0264ef9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -879,7 +879,7 @@ struct file_ra_state { there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ - int mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; -- cgit v1.2.3 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dumped by the way. Users will be pretty sensitive about the slow loading of executables. So it's unfavorable to disabled mmap read-around on a congested queue. [akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab0..33da7f538841 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -- cgit v1.2.3 From dc566127dd161b6c997466a2349ac179527ea89b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:32 -0700 Subject: radix-tree: add radix_tree_prev_hole() The counterpart of radix_tree_next_hole(). To be used by context readahead. Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 355f6e80db0d..c5da74918096 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, -- cgit v1.2.3 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. Also, move get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 33da7f538841..a880161a3854 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,8 +824,11 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas); +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -849,19 +852,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); -/* - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm (force=0 and doesn't return any vmas). - * - * get_user_pages_fast may take mmap_sem and page tables, so no assumptions - * can be made about locking. get_user_pages_fast is to be implemented in a - * way that is advantageous (vs get_user_pages()) when the user memory area is - * already faulted in and present in ptes. However if the pages have to be - * faulted in, it may turn out to be slightly slower). - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); - /* * A callback you can register to apply pressure to ageable caches. * -- cgit v1.2.3 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 13 +++++++++---- include/linux/sched.h | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 05ea1dd7d681..a5740fc4d04b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ -extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p, extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); -void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); @@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +static inline void set_mems_allowed(nodemask_t nodemask) +{ + current->mems_allowed = nodemask; +} + #else /* !CONFIG_CPUSETS */ -static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} @@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_task_memory_state(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { @@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p) { } +static inline void set_mems_allowed(nodemask_t nodemask) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa530070..1048bf50540a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,7 +1318,8 @@ struct task_struct { /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, + * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS @@ -1386,8 +1387,7 @@ struct task_struct { cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; - int cpuset_mems_generation; + nodemask_t mems_allowed; /* Protected by alloc_lock */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS @@ -1410,7 +1410,7 @@ struct task_struct { struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; + struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ -- cgit v1.2.3 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine but the results I've seen so fair are approximately; kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I haven't sysbench figures at hand, but previously they were within the -0.5% to 2% range. On netperf, the client and server were bound to opposite number CPUs to maximise the problems with cache line bouncing of the struct pages so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a big ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - This patch renames an exported symbol. No kernel driver is affected by external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c5de02..549ec5583103 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -172,24 +172,16 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - - static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { -- cgit v1.2.3 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation. Delete this check and make it a VM_BUG_ON check further down the call path. [akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 549ec5583103..c2d3fe03b5d2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -185,9 +185,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - /* Unknown node is current node */ if (nid < 0) nid = numa_node_id(); @@ -201,9 +198,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_page_vma(gfp_t gfp_mask, -- cgit v1.2.3 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d2..4efa33088a82 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a3854..7b548e7cfbd9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 49255c619fbd482d704289b5eb2795f8e3b7ff2e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:58 -0700 Subject: page allocator: move check for disabled anti-fragmentation out of fastpath On low-memory systems, anti-fragmentation gets disabled as there is nothing it can do and it would just incur overhead shuffling pages between lists constantly. Currently the check is made in the free page fast path for every page. This patch moves it to a slow path. On machines with low memory, there will be small amount of additional overhead as pages get shuffled between lists but it should quickly settle. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a47c879e1304..0aa4445b0b8a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -50,9 +50,6 @@ extern int page_group_by_mobility_disabled; static inline int get_pageblock_migratetype(struct page *page) { - if (unlikely(page_group_by_mobility_disabled)) - return MIGRATE_UNMOVABLE; - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); } -- cgit v1.2.3 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determin whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers. All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aa4445b0b8a..dd8487f0442f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,17 @@ static inline int is_unevictable_lru(enum lru_list l) #endif } +enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, + NR_WMARK +}; + +#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -275,7 +286,10 @@ struct zone_reclaim_stat { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several -- cgit v1.2.3 From 62bc62a873116805774ffd37d7f86aa4faa832b1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:15 -0700 Subject: page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths num_online_nodes() is called in a number of places but most often by the page allocator when deciding whether the zonelist needs to be filtered based on cpusets or the zonelist cache. This is actually a heavy function and touches a number of cache lines. This patch stores the number of online nodes at boot time and updates the value when nodes get onlined and offlined. The value is then used in a number of important paths in place of num_online_nodes(). [rientjes@google.com: do not override definition of node_set_online() with macro] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 848025cd7087..829b94b156f2 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -408,6 +408,19 @@ static inline int num_node_state(enum node_states state) #define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) extern int nr_node_ids; +extern int nr_online_nodes; + +static inline void node_set_online(int nid) +{ + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} + +static inline void node_set_offline(int nid) +{ + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} #else static inline int node_state(int node, enum node_states state) @@ -434,7 +447,10 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 +#define nr_online_nodes 1 +#define node_set_online(node) node_set_state((node), N_ONLINE) +#define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #define node_online_map node_states[N_ONLINE] @@ -454,9 +470,6 @@ static inline int num_node_state(enum node_states state) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) -#define node_set_online(node) node_set_state((node), N_ONLINE) -#define node_set_offline(node) node_clear_state((node), N_ONLINE) - #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) -- cgit v1.2.3 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03be7f29ca01..a05a5ef33391 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,8 @@ struct ctl_table; +int PageHuge(struct page *page); + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return vma->vm_flags & VM_HUGETLB; @@ -61,6 +63,11 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ +static inline int PageHuge(struct page *page) +{ + return 0; +} + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; -- cgit v1.2.3 From 56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25b9ca93d232..45add35dda1b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,6 +94,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -239,6 +240,12 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 1; } +static inline int +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + return 1; +} + static inline unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) -- cgit v1.2.3 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is twisting. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dd8487f0442f..db976b9f8791 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -334,9 +334,9 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct { + struct zone_lru { struct list_head list; - unsigned long nr_scan; + unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; -- cgit v1.2.3 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analoguous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7cfbd9..c80dbd4a62c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4efa33088a82..06b7e8cc80ac 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -243,4 +243,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From 31c911329e048b715a1dfeaaf617be9430fd7f4e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jun 2009 15:32:45 -0700 Subject: mm: check the argument of kunmap on architectures without highmem If you're using a non-highmem architecture, passing an argument with the wrong type to kunmap() doesn't give you a warning because the ifdef doesn't check the type. Using a static inline function solves the problem nicely. Reported-by: David Woodhouse Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1fcb7126a01f..211ff4497269 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,7 +55,9 @@ static inline void *kmap(struct page *page) return page_address(page); } -#define kunmap(page) do { (void) (page); } while (0) +static inline void kunmap(struct page *page) +{ +} static inline void *kmap_atomic(struct page *page, enum km_type idx) { -- cgit v1.2.3 From b70d94ee438b3fd9c15c7691d7a932a135c18101 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:46 -0700 Subject: page-allocator: use integer fields lookup for gfp_zone and check for errors in flags passed to the page allocator This simplifies the code in gfp_zone() and also keeps the ability of the compiler to use constant folding to get rid of gfp_zone processing. The lookup of the zone is done using a bitfield stored in an integer. So the code in gfp_zone is a simple extraction of bits from a constant bitfield. The compiler is generating a load of a constant into a register and then performs a shift and mask operation to get the zone from a gfp_t. No cachelines are touched and no branches have to be predicted by the compiler. We are doing some macro tricks here to convince the compiler to always do the constant folding if possible. Signed-off-by: Christoph Lameter Cc: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 111 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06b7e8cc80ac..412178afd423 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -21,7 +21,8 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #define __GFP_DMA32 ((__force gfp_t)0x04u) - +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning * @@ -51,7 +52,6 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -116,24 +116,105 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) ((gfp_flags & __GFP_RECLAIMABLE) != 0); } -static inline enum zone_type gfp_zone(gfp_t flags) -{ +#ifdef CONFIG_HIGHMEM +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM +#else +#define OPT_ZONE_HIGHMEM ZONE_NORMAL +#endif + #ifdef CONFIG_ZONE_DMA - if (flags & __GFP_DMA) - return ZONE_DMA; +#define OPT_ZONE_DMA ZONE_DMA +#else +#define OPT_ZONE_DMA ZONE_NORMAL #endif + #ifdef CONFIG_ZONE_DMA32 - if (flags & __GFP_DMA32) - return ZONE_DMA32; +#define OPT_ZONE_DMA32 ZONE_DMA32 +#else +#define OPT_ZONE_DMA32 ZONE_NORMAL #endif - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) == - (__GFP_HIGHMEM | __GFP_MOVABLE)) - return ZONE_MOVABLE; -#ifdef CONFIG_HIGHMEM - if (flags & __GFP_HIGHMEM) - return ZONE_HIGHMEM; + +/* + * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the + * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long + * and there are 16 of them to cover all possible combinations of + * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM + * + * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. + * But GFP_MOVABLE is not only a zone specifier but also an allocation + * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. + * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". + * + * bit result + * ================= + * 0x0 => NORMAL + * 0x1 => DMA or NORMAL + * 0x2 => HIGHMEM or NORMAL + * 0x3 => BAD (DMA+HIGHMEM) + * 0x4 => DMA32 or DMA or NORMAL + * 0x5 => BAD (DMA+DMA32) + * 0x6 => BAD (HIGHMEM+DMA32) + * 0x7 => BAD (HIGHMEM+DMA32+DMA) + * 0x8 => NORMAL (MOVABLE+0) + * 0x9 => DMA or NORMAL (MOVABLE+DMA) + * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) + * 0xb => BAD (MOVABLE+HIGHMEM+DMA) + * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xd => BAD (MOVABLE+DMA32+DMA) + * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) + * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) + * + * ZONES_SHIFT must be <= 2 on 32 bit platforms. + */ + +#if 16 * ZONES_SHIFT > BITS_PER_LONG +#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#endif + +#define GFP_ZONE_TABLE ( \ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ +) + +/* + * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 + * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per + * entry starting with bit 0. Bit is set if the combination is not + * allowed. + */ +#define GFP_ZONE_BAD ( \ + 1 << (__GFP_DMA | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32) \ + | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ +) + +static inline enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + int bit = flags & GFP_ZONEMASK; + + z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & + ((1 << ZONES_SHIFT) - 1); + + if (__builtin_constant_p(bit)) + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + else { +#ifdef CONFIG_DEBUG_VM + BUG_ON((GFP_ZONE_BAD >> bit) & 1); #endif - return ZONE_NORMAL; + } + return z; } /* -- cgit v1.2.3 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes low, high of zone as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[hig/low/min] to zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too. There are not pages_min any more. * init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c80dbd4a62c6..6e11cda1ba02 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,7 +1052,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -extern void setup_per_zone_pages_min(void); +extern void setup_per_zone_wmarks(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); -- cgit v1.2.3 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithemetic inside setup_per_zone_inactive_ratio()'s loop into a a separate function, calculate_zone_inactive_ratio(). This function will be used in a later patch [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e11cda1ba02..f0473a88ca2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,6 +1053,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); +extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); -- cgit v1.2.3 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 ------------- include/linux/page-flags.h | 16 +--------------- include/linux/pagemap.h | 12 ------------ include/linux/rmap.h | 7 ------- include/linux/swap.h | 19 ------------------- include/linux/vmstat.h | 2 -- 6 files changed, 1 insertion(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9f8791..889598537370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d93..d6792f88a176 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da5230faab..aec3252afcf5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e19cd9..619379a1dd98 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad3ff57..f30c06908f09 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b28ecb..ff4696c6dce3 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30c06908f09..259e96c150ef 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); +extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -352,11 +354,16 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swap_duplicate(swp) is_migration_entry(swp) +#define swapcache_prepare(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { } +static inline void swapcache_free(swp_entry_t swp, struct page *page) +{ +} + static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { -- cgit v1.2.3 From 355cfa73ddff2fb8fa14e93bd94a057cc022512e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:53 -0700 Subject: mm: modify swap_map and add SWAP_HAS_CACHE flag This is a part of the patches for fixing memcg's swap accountinf leak. But, IMHO, not a bad patch even if no memcg. There are 2 kinds of references to swap. - reference from swap entry - reference from swap cache Then, - If there is swap cache && swap's refcnt is 1, there is only swap cache. (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL This counting logic have worked well for a long time. But considering that we cannot know there is a _real_ reference or not by swap_map[], current usage of counter is not very good. This patch adds a flag SWAP_HAS_CACHE and recored information that a swap entry has a cache or not. This will remove -1 magic used in swapfile.c and be a help to avoid unnecessary find_get_page(). Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Daisuke Nishimura Cc: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 259e96c150ef..fed5e8e1104b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -129,9 +129,10 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 - +#define SWAP_MAP_MAX 0x7ffe +#define SWAP_MAP_BAD 0x7fff +#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ +#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) /* * The in-memory structure used to track swap areas. */ @@ -281,7 +282,7 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern int swap_duplicate(swp_entry_t); +extern void swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); @@ -353,9 +354,12 @@ static inline void show_swap_cache_info(void) } #define free_swap_and_cache(swp) is_migration_entry(swp) -#define swap_duplicate(swp) is_migration_entry(swp) #define swapcache_prepare(swp) is_migration_entry(swp) +static inline void swap_duplicate(swp_entry_t swp) +{ +} + static inline void swap_free(swp_entry_t swp) { } -- cgit v1.2.3 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf21..f4408106fcbc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,6 +232,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + s8 oom_adj; /* OOM kill score adjustment (bit shift) */ + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540a..1bc6fae0c135 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif -- cgit v1.2.3 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8146e0264ef9..f5ae9f19b8a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,9 +2036,6 @@ extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif extern int invalidate_inodes(struct super_block *); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, - bool be_atomic); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v1.2.3 From aca8bf323edd31ad462dc98c107c23a5c6022ca2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm: remove file argument from swap_readpage() The file argument resulted from address_space's readpage long time ago. We don't use it any more. Let's remove unnecessary argement. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index fed5e8e1104b..0cedf31af0b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,7 +256,7 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ -extern int swap_readpage(struct file *, struct page *); +extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_read(struct bio *bio, int err); -- cgit v1.2.3 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument As function shmem_file_setup does not modify/allocate/free/pass given filename - mark it as const. Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f0473a88ca2e..d88d6fc530ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,7 +724,7 @@ static inline int shmem_lock(struct file *file, int lock, return 0; } #endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); -- cgit v1.2.3 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 619379a1dd98..216d024f830d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,7 +83,8 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt); +int page_referenced(struct page *, int is_locked, + struct mem_cgroup *cnt, unsigned long *vm_flags); int try_to_unmap(struct page *, int ignore_refs); /* @@ -117,7 +118,7 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l,cnt) TestClearPageReferenced(page) +#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) -- cgit v1.2.3 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation so this patch adds a counter "zreclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0. [akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ff4696c6dce3..81a97cf8f0a0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), +#ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_FAILED, +#endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From a7d932af06e8eee2163627d19898e18da5635449 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 16 Jun 2009 15:33:33 -0700 Subject: utsname.h: make new_utsname fields use the proper length constant The members of the new_utsname structure are defined with magic numbers that *should* correspond to the constant __NEW_UTS_LEN+1. Everywhere else, code assumes this and uses the constant, so this patch makes the structure match. Originally suggested by Serge here: https://lists.linux-foundation.org/pipermail/containers/2009-March/016258.html Signed-off-by: Dan Smith Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 11232676bfff..3656b300de3a 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -22,12 +22,12 @@ struct old_utsname { }; struct new_utsname { - char sysname[65]; - char nodename[65]; - char release[65]; - char version[65]; - char machine[65]; - char domainname[65]; + char sysname[__NEW_UTS_LEN + 1]; + char nodename[__NEW_UTS_LEN + 1]; + char release[__NEW_UTS_LEN + 1]; + char version[__NEW_UTS_LEN + 1]; + char machine[__NEW_UTS_LEN + 1]; + char domainname[__NEW_UTS_LEN + 1]; }; #ifdef __KERNEL__ -- cgit v1.2.3 From 4938d7e0233a455f04507bac81d0886c71529537 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 15:33:36 -0700 Subject: poll: avoid extra wakeups in select/poll After introduction of keyed wakeups Davide Libenzi did on epoll, we are able to avoid spurious wakeups in poll()/select() code too. For example, typical use of poll()/select() is to wait for incoming network frames on many sockets. But TX completion for UDP/TCP frames call sock_wfree() which in turn schedules thread. When scheduled, thread does a full scan of all polled fds and can sleep again, because nothing is really available. If number of fds is large, this cause significant load. This patch makes select()/poll() aware of keyed wakeups and useless wakeups are avoided. This reduces number of context switches by about 50% on some setups, and work performed by sofirq handlers. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: Andi Kleen Acked-by: Ingo Molnar Acked-by: Davide Libenzi Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poll.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/poll.h b/include/linux/poll.h index 8c24ef8d9976..fa287f25138d 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,6 +32,7 @@ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_ typedef struct poll_table_struct { poll_queue_proc qproc; + unsigned long key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -43,10 +44,12 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->qproc = qproc; + pt->key = ~0UL; /* all events enabled */ } struct poll_table_entry { struct file *filp; + unsigned long key; wait_queue_t wait; wait_queue_head_t *wait_address; }; -- cgit v1.2.3 From 0d9c25dde878a636ee9a9b53923569171bf9a55b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:33:37 -0700 Subject: headers: move module_bug_finalize()/module_bug_cleanup() definitions into module.h They're in linux/bug.h at present, which causes include order tangles. In particular, linux/bug.h cannot be used by linux/atomic.h because, according to Nikanth: linux/bug.h pulls in linux/module.h => linux/spinlock.h => asm/spinlock.h (which uses atomic_inc) => asm/atomic.h. bug.h is a pretty low-level thing and module.h is a higher-level thing, IMO. Cc: Nikanth Karthikesan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 12 ------------ include/linux/module.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 54398d2c6d8d..d276b5510c83 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -1,7 +1,6 @@ #ifndef _LINUX_BUG_H #define _LINUX_BUG_H -#include #include enum bug_trap_type { @@ -24,10 +23,6 @@ const struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, - struct module *); -void module_bug_cleanup(struct module *); - /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -38,13 +33,6 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, { return BUG_TRAP_TYPE_BUG; } -static inline int module_bug_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod) -{ - return 0; -} -static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #endif /* _LINUX_BUG_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a7bc6e7b43a7..505f20dcc1c7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -697,4 +697,21 @@ static inline void module_remove_modinfo_attrs(struct module *mod) #define __MODULE_STRING(x) __stringify(x) + +#ifdef CONFIG_GENERIC_BUG +int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, + struct module *); +void module_bug_cleanup(struct module *); + +#else /* !CONFIG_GENERIC_BUG */ + +static inline int module_bug_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ + return 0; +} +static inline void module_bug_cleanup(struct module *mod) {} +#endif /* CONFIG_GENERIC_BUG */ + #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From 10fc89d01a7ea2ecc2a58d2f4e7700f47178cd62 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 16 Jun 2009 15:33:38 -0700 Subject: drbd: add major number to major.h Since we have had a LANANA major number for years, and it is documented in devices.txt, I think that this first patch can go upstream. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/major.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/major.h b/include/linux/major.h index 058ec15dd060..6a8ca98c9a96 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -145,6 +145,7 @@ #define UNIX98_PTY_MAJOR_COUNT 8 #define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) +#define DRBD_MAJOR 147 #define RTF_MAJOR 150 #define RAW_MAJOR 162 -- cgit v1.2.3 From 8b0b1db0133e4218a9b45c09e53793c039edebe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:39 -0700 Subject: remove put_cpu_no_resched() put_cpu_no_resched() is an optimization of put_cpu() which unfortunately can cause high latencies. The nfs iostats code uses put_cpu_no_resched() in a code sequence where a reschedule request caused by an interrupt between the get_cpu() and the put_cpu_no_resched() can delay the reschedule for at least HZ. The other users of put_cpu_no_resched() optimize correctly in interrupt code, but there is no real harm in using the put_cpu() function which is an alias for preempt_enable(). The extra check of the preemmpt count is not as critical as the potential source of missing a reschedule. Debugged in the preempt-rt tree and verified in mainline. Impact: remove a high latency source [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Tony Luck Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index a69db820eed6..9e3d8af09207 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline void init_call_single_data(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() /* * Callback to arch code if there's nosmp or maxcpus=0 on the -- cgit v1.2.3 From e4c9dd0fbad60c098a026e9b06d9de1bc98c5e89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 16 Jun 2009 15:33:47 -0700 Subject: kmap_types: make most arches use generic header file Convert most arches to use asm-generic/kmap_types.h. Move the KM_FENCE_ macro additions into asm-generic/kmap_types.h, controlled by __WITH_KM_FENCE from each arch's kmap_types.h file. Would be nice to be able to add custom KM_types per arch, but I don't yet see a nice, clean way to do that. Built on x86_64, i386, mips, sparc, alpha(tonyb), powerpc(tonyb), and 68k(tonyb). Note: avr32 should be able to remove KM_PTE2 (since it's not used) and then just use the generic kmap_types.h file. Get avr32 maintainer approval. Signed-off-by: Randy Dunlap Cc: Acked-by: Mike Frysinger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Bryan Wu Cc: Mikael Starvik Cc: Hirokazu Takata Cc: "Luck Tony" Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Arnd Bergmann Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/kmap_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h index 58c33055c304..54e8b3d956b7 100644 --- a/include/asm-generic/kmap_types.h +++ b/include/asm-generic/kmap_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_GENERIC_KMAP_TYPES_H #define _ASM_GENERIC_KMAP_TYPES_H -#ifdef CONFIG_DEBUG_HIGHMEM +#ifdef __WITH_KM_FENCE # define D(n) __KM_FENCE_##n , #else # define D(n) -- cgit v1.2.3 From cc6f26774136b7f5307abcd3887f08360c9b7554 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 16 Jun 2009 15:33:49 -0700 Subject: syscalls.h: remove duplicated declarations for sys_pipe2 sys_pipe2 is declared twice in include/linux/syscalls.h. Signed-off-by: Masatake YAMATO Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 418d90f5effe..fa4242cdade8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -434,6 +434,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); @@ -751,8 +752,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); -asmlinkage long sys_pipe2(int __user *, int); -asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 55e331cf7ebe20665253770589cd9eb06048bf25 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 16 Jun 2009 15:33:53 -0700 Subject: drivers: add support for the TI VLYNQ bus Add support for the TI VLYNQ high-speed, serial and packetized bus. This bus allows external devices to be connected to the System-on-Chip and appear in the main system memory just like any memory mapped peripheral. It is widely used in TI's networking and multimedia SoC, including the AR7 SoC. Signed-off-by: Eugene Konev Signed-off-by: Florian Fainelli Cc: Ralf Baechle Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vlynq.h | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 include/linux/vlynq.h (limited to 'include') diff --git a/include/linux/vlynq.h b/include/linux/vlynq.h new file mode 100644 index 000000000000..8f6a95882b09 --- /dev/null +++ b/include/linux/vlynq.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __VLYNQ_H__ +#define __VLYNQ_H__ + +#include +#include +#include + +#define VLYNQ_NUM_IRQS 32 + +struct vlynq_mapping { + u32 size; + u32 offset; +}; + +enum vlynq_divisor { + vlynq_div_auto = 0, + vlynq_ldiv1, + vlynq_ldiv2, + vlynq_ldiv3, + vlynq_ldiv4, + vlynq_ldiv5, + vlynq_ldiv6, + vlynq_ldiv7, + vlynq_ldiv8, + vlynq_rdiv1, + vlynq_rdiv2, + vlynq_rdiv3, + vlynq_rdiv4, + vlynq_rdiv5, + vlynq_rdiv6, + vlynq_rdiv7, + vlynq_rdiv8, + vlynq_div_external +}; + +struct vlynq_device_id { + u32 id; + enum vlynq_divisor divisor; + unsigned long driver_data; +}; + +struct vlynq_regs; +struct vlynq_device { + u32 id, dev_id; + int local_irq; + int remote_irq; + enum vlynq_divisor divisor; + u32 regs_start, regs_end; + u32 mem_start, mem_end; + u32 irq_start, irq_end; + int irq; + int enabled; + struct vlynq_regs *local; + struct vlynq_regs *remote; + struct device dev; +}; + +struct vlynq_driver { + char *name; + struct vlynq_device_id *id_table; + int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id); + void (*remove)(struct vlynq_device *dev); + struct device_driver driver; +}; + +struct plat_vlynq_ops { + int (*on)(struct vlynq_device *dev); + void (*off)(struct vlynq_device *dev); +}; + +static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv) +{ + return container_of(drv, struct vlynq_driver, driver); +} + +static inline struct vlynq_device *to_vlynq_device(struct device *device) +{ + return container_of(device, struct vlynq_device, dev); +} + +extern struct bus_type vlynq_bus_type; + +extern int __vlynq_register_driver(struct vlynq_driver *driver, + struct module *owner); + +static inline int vlynq_register_driver(struct vlynq_driver *driver) +{ + return __vlynq_register_driver(driver, THIS_MODULE); +} + +static inline void *vlynq_get_drvdata(struct vlynq_device *dev) +{ + return dev_get_drvdata(&dev->dev); +} + +static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data) +{ + dev_set_drvdata(&dev->dev, data); +} + +static inline u32 vlynq_mem_start(struct vlynq_device *dev) +{ + return dev->mem_start; +} + +static inline u32 vlynq_mem_end(struct vlynq_device *dev) +{ + return dev->mem_end; +} + +static inline u32 vlynq_mem_len(struct vlynq_device *dev) +{ + return dev->mem_end - dev->mem_start + 1; +} + +static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + return irq; +} + +static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq) +{ + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + return irq - dev->irq_start; +} + +extern void vlynq_unregister_driver(struct vlynq_driver *driver); +extern int vlynq_enable_device(struct vlynq_device *dev); +extern void vlynq_disable_device(struct vlynq_device *dev); +extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq); +extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq); + +#endif /* __VLYNQ_H__ */ -- cgit v1.2.3 From 8f3128e714ded7cf1e8c786c204a4f253b5d8ff4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 16 Jun 2009 15:34:17 -0700 Subject: lis3: add click function The LIS302DL accelerometer chip has a 'click' feature which can be used to detect sudden motion on any of the three axis. Configuration data is passed via spi platform_data and no action is taken if that's not specified, so it won't harm any existing platform. To make the configuration effective, the IRQ lines need to be set up appropriately. This patch also adds a way to do that from board support code. The DD_* definitions were factored out to an own enum because they are specific to LIS3LV02D devices. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lis3lv02d.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 include/linux/lis3lv02d.h (limited to 'include') diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h new file mode 100644 index 000000000000..ad651f4e45ac --- /dev/null +++ b/include/linux/lis3lv02d.h @@ -0,0 +1,39 @@ +#ifndef __LIS3LV02D_H_ +#define __LIS3LV02D_H_ + +struct lis3lv02d_platform_data { + /* please note: the 'click' feature is only supported for + * LIS[32]02DL variants of the chip and will be ignored for + * others */ +#define LIS3_CLICK_SINGLE_X (1 << 0) +#define LIS3_CLICK_DOUBLE_X (1 << 1) +#define LIS3_CLICK_SINGLE_Y (1 << 2) +#define LIS3_CLICK_DOUBLE_Y (1 << 3) +#define LIS3_CLICK_SINGLE_Z (1 << 4) +#define LIS3_CLICK_DOUBLE_Z (1 << 5) + unsigned char click_flags; + unsigned char click_thresh_x; + unsigned char click_thresh_y; + unsigned char click_thresh_z; + unsigned char click_time_limit; + unsigned char click_latency; + unsigned char click_window; + +#define LIS3_IRQ1_DISABLE (0 << 0) +#define LIS3_IRQ1_FF_WU_1 (1 << 0) +#define LIS3_IRQ1_FF_WU_2 (2 << 0) +#define LIS3_IRQ1_FF_WU_12 (3 << 0) +#define LIS3_IRQ1_DATA_READY (4 << 0) +#define LIS3_IRQ1_CLICK (7 << 0) +#define LIS3_IRQ2_DISABLE (0 << 3) +#define LIS3_IRQ2_FF_WU_1 (1 << 3) +#define LIS3_IRQ2_FF_WU_2 (2 << 3) +#define LIS3_IRQ2_FF_WU_12 (3 << 3) +#define LIS3_IRQ2_DATA_READY (4 << 3) +#define LIS3_IRQ2_CLICK (7 << 3) +#define LIS3_IRQ_OPEN_DRAIN (1 << 6) +#define LIS3_IRQ_ACTIVE_HIGH (1 << 7) + unsigned char irq_cfg; +}; + +#endif /* __LIS3LV02D_H_ */ -- cgit v1.2.3 From ae52bb2384f721562f15f719de1acb8e934733cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Jun 2009 15:34:19 -0700 Subject: fbdev: move logo externs to header file Now we have __initconst, we can finally move the external declarations for the various Linux logo structures to . James' ack dates back to the previous submission (way to long ago), when the logos were still __initdata, which caused failures on some platforms with some toolchain versions. Signed-off-by: Geert Uytterhoeven Acked-by: James Simmons Cc: Krzysztof Helt Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 2 +- include/linux/linux_logo.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index b2189803f19a..8c2c9989626d 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -29,7 +29,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index 08a92969c76e..ca5bd91d12e1 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -32,6 +32,22 @@ struct linux_logo { const unsigned char *data; }; +extern const struct linux_logo logo_linux_mono; +extern const struct linux_logo logo_linux_vga16; +extern const struct linux_logo logo_linux_clut224; +extern const struct linux_logo logo_blackfin_vga16; +extern const struct linux_logo logo_blackfin_clut224; +extern const struct linux_logo logo_dec_clut224; +extern const struct linux_logo logo_mac_clut224; +extern const struct linux_logo logo_parisc_clut224; +extern const struct linux_logo logo_sgi_clut224; +extern const struct linux_logo logo_sun_clut224; +extern const struct linux_logo logo_superh_mono; +extern const struct linux_logo logo_superh_vga16; +extern const struct linux_logo logo_superh_clut224; +extern const struct linux_logo logo_m32r_clut224; +extern const struct linux_logo logo_spe_clut224; + extern const struct linux_logo *fb_find_logo(int depth); #ifdef CONFIG_FB_LOGO_EXTRA extern void fb_append_extra_logo(const struct linux_logo *logo, -- cgit v1.2.3 From 4410f3910947dcea8672280b3adecd53cec4e85e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 16 Jun 2009 15:34:38 -0700 Subject: fbdev: add support for handoff from firmware to hw framebuffers With KMS we have ran into an issue where we really want the KMS fb driver to be the one running the console, so panics etc can be shown by switching out of X etc. However with vesafb/efifb built-in, we end up with those on fb0 and the KMS fb driver on fb1, driving the same piece of hw, so this adds an fb info flag to denote a firmware fbdev, and adds a new aperture base/size range which can be compared when the hw drivers are installed to see if there is a conflict with a firmware driver, and if there is the firmware driver is unregistered and the hw driver takes over. It uses new aperture_base/size members instead of comparing on the fix smem_start/length, as smem_start/length might for example only cover the first 1MB of the PCI aperture, and we could allocate the kms fb from 8MB into the aperture, thus they would never overlap. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Airlie Acked-by: Peter Jones Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fb.h b/include/linux/fb.h index 330c4b1bfcaa..3c5562a52167 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -677,6 +677,9 @@ struct fb_ops { /* get capability given var */ void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps, struct fb_var_screeninfo *var); + + /* teardown any resources to do with this framebuffer */ + void (*fb_destroy)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING @@ -786,6 +789,8 @@ struct fb_tile_ops { #define FBINFO_MISC_USEREVENT 0x10000 /* event request from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ +#define FBINFO_MISC_FIRMWARE 0x40000 /* a replaceable firmware + inited framebuffer */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. The advantage is that with @@ -854,7 +859,12 @@ struct fb_info { u32 state; /* Hardware state i.e suspend */ void *fbcon_par; /* fbcon use-only private area */ /* From here on everything is device dependent */ - void *par; + void *par; + /* we need the PCI or similiar aperture base/size not + smem_start/size as smem_start may just be an object + allocated inside the aperture so may not actually overlap */ + resource_size_t aperture_base; + resource_size_t aperture_size; }; #ifdef MODULE -- cgit v1.2.3 From 3ed167af96ed098187ea41353fe02d1af20d38a1 Mon Sep 17 00:00:00 2001 From: Kristoffer Ericson Date: Tue, 16 Jun 2009 15:34:40 -0700 Subject: fbdev: s1d13xxxfb: add accelerated bitblt functions Add accelerated bitblt functions to s1d13xxx based video chipsets, more specificly functions copyarea and fillrect. It has only been tested and activated for 13506 chipsets but is expected to work for the majority of s1d13xxx based chips. This patch also cleans up the driver with respect of whitespaces and other formatting issues. We update the current status comments. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kristoffer Ericson Cc: Russell King Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/video/s1d13xxxfb.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/video/s1d13xxxfb.h b/include/video/s1d13xxxfb.h index c3b2a2aa7140..f0736cff2ca3 100644 --- a/include/video/s1d13xxxfb.h +++ b/include/video/s1d13xxxfb.h @@ -136,6 +136,15 @@ #define S1DREG_DELAYOFF 0xFFFE #define S1DREG_DELAYON 0xFFFF +#define BBLT_FIFO_EMPTY 0x00 +#define BBLT_FIFO_NOT_EMPTY 0x40 +#define BBLT_FIFO_NOT_FULL 0x30 +#define BBLT_FIFO_HALF_FULL 0x20 +#define BBLT_FIFO_FULL 0x10 + +#define BBLT_SOLID_FILL 0x0c + + /* Note: all above defines should go in separate header files when implementing other S1D13xxx chip support. */ -- cgit v1.2.3 From 00115e6690ed5aa90530e1e41c467cd895dd18fc Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 16 Jun 2009 15:34:40 -0700 Subject: fbdev: blackfin has __raw I/O accessors, so use them in fb.h Signed-off-by: Mike Frysinger Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fb.h b/include/linux/fb.h index 3c5562a52167..dd68358996b7 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -903,7 +903,7 @@ struct fb_info { #define fb_writeq sbus_writeq #define fb_memset sbus_memset_io -#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__) +#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__) || defined(__bfin__) #define fb_readb __raw_readb #define fb_readw __raw_readw -- cgit v1.2.3