From 95a7d76897c1e7243d4137037c66d15cbf2cce76 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 31 Oct 2012 12:38:31 -0400 Subject: xen/mmu: Use Xen specific TLB flush instead of the generic one. As Mukesh explained it, the MMUEXT_TLB_FLUSH_ALL allows the hypervisor to do a TLB flush on all active vCPUs. If instead we were using the generic one (which ends up being xen_flush_tlb) we end up making the MMUEXT_TLB_FLUSH_LOCAL hypercall. But before we make that hypercall the kernel will IPI all of the vCPUs (even those that were asleep from the hypervisor perspective). The end result is that we needlessly wake them up and do a TLB flush when we can just let the hypervisor do it correctly. This patch gives around 50% speed improvement when migrating idle guests from one host to another. Oracle-bug: 14630170 CC: stable@vger.kernel.org Tested-by: Jingjie Jiang Suggested-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- include/trace/events/xen.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index 15ba03bdd7c6..d06b6da5c1e3 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -377,6 +377,14 @@ DECLARE_EVENT_CLASS(xen_mmu_pgd, DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_pin); DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_unpin); +TRACE_EVENT(xen_mmu_flush_tlb_all, + TP_PROTO(int x), + TP_ARGS(x), + TP_STRUCT__entry(__array(char, x, 0)), + TP_fast_assign((void)x), + TP_printk("%s", "") + ); + TRACE_EVENT(xen_mmu_flush_tlb, TP_PROTO(int x), TP_ARGS(x), -- cgit v1.2.3 From 01e3e710a9265fb7092efd67243d7b6dd6e2548a Mon Sep 17 00:00:00 2001 From: David Sharp Date: Thu, 7 Jun 2012 16:46:24 -0700 Subject: tracing: Trivial cleanup Remove ftrace_format_syscall() declaration; it is neither defined nor used. Also update a comment and formatting.
Link: http://lkml.kernel.org/r/1339112785-21806-1-git-send-email-vnagarnaik@google.com Signed-off-by: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/trace/syscall.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 31966a4fb8cc..0c95796177d7 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -39,8 +39,6 @@ extern int reg_event_syscall_enter(struct ftrace_event_call *call); extern void unreg_event_syscall_enter(struct ftrace_event_call *call); extern int reg_event_syscall_exit(struct ftrace_event_call *call); extern void unreg_event_syscall_exit(struct ftrace_event_call *call); -extern int -ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, -- cgit v1.2.3 From 6f86ab9fcaef122abb837819139eadac1a0ca966 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 7 Jun 2012 16:46:25 -0700 Subject: tracing: Cleanup unnecessary function declarations The functions defined in include/trace/syscalls.h are not used directly since struct ftrace_event_class was introduced. Remove them from the header file and rearrange the ftrace_event_class declarations in trace_syscalls.c. 
Link: http://lkml.kernel.org/r/1339112785-21806-2-git-send-email-vnagarnaik@google.com Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/trace/syscall.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/trace') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 0c95796177d7..84bc4197e736 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -31,25 +31,4 @@ struct syscall_metadata { struct ftrace_event_call *exit_event; }; -#ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long arch_syscall_addr(int nr); -extern int init_syscall_trace(struct ftrace_event_call *call); - -extern int reg_event_syscall_enter(struct ftrace_event_call *call); -extern void unreg_event_syscall_enter(struct ftrace_event_call *call); -extern int reg_event_syscall_exit(struct ftrace_event_call *call); -extern void unreg_event_syscall_exit(struct ftrace_event_call *call); -enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, - struct trace_event *event); -enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, - struct trace_event *event); -#endif - -#ifdef CONFIG_PERF_EVENTS -int perf_sysenter_enable(struct ftrace_event_call *call); -void perf_sysenter_disable(struct ftrace_event_call *call); -int perf_sysexit_enable(struct ftrace_event_call *call); -void perf_sysexit_disable(struct ftrace_event_call *call); -#endif - #endif /* _TRACE_SYSCALL_H */ -- cgit v1.2.3 From 0d5c6e1c19bab82fad4837108c2902f557d62a04 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Nov 2012 20:54:21 -0400 Subject: tracing: Use irq_work for wake ups and remove *_nowake_*() functions Have the ring buffer commit function use the irq_work infrastructure to wake up any waiters waiting on the ring buffer for new data. 
The irq_work was created for such a purpose, where doing the actual wake up at the time of adding data is too dangerous, as an event or function trace may be in the midst of the work queue locks and cause deadlocks. The irq_work will either delay the action to the next timer interrupt, or trigger an IPI to itself forcing an interrupt to do the work (in a safe location). With irq_work, all ring buffer commits can safely do wakeups, removing the need for the ring buffer commit "nowake" variants, which were used by events and function tracing. All commits can now safely use the normal commit, and the "nowake" variants can be removed. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a763888a36f9..698f2a890322 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -545,8 +545,7 @@ ftrace_raw_event_##call(void *__data, proto) \ { assign; } \ \ if (!filter_current_check_discard(buffer, event_call, entry, event)) \ - trace_nowake_buffer_unlock_commit(buffer, \ - event, irq_flags, pc); \ + trace_buffer_unlock_commit(buffer, event, irq_flags, pc); \ } /* * The ftrace_test_probe is compiled out, it is only here as a build time check -- cgit v1.2.3 From b5645534ce84c21695c2f82d4d4f67cf2a67229a Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:33:43 -0500 Subject: ext4: print 'flags' in ext4_ext_handle_uninitialized_extents In trace_ext4_ext_handle_uninitialized_extents we don't care about the value of map->m_flags because this value is probably 0, and we prefer to get the value of flags because we can know how to handle this extent in this function. 
Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d49b285385e8..25914e3002c7 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, ); TRACE_EVENT(ext4_ext_handle_uninitialized_extents, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), - TP_ARGS(inode, map, allocated, newblock), + TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->flags = map->m_flags; + __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; @@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, -- cgit v1.2.3 From 19b303d8b5a0e8150a4697c01ca03e75a0a17469 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:34:04 -0500 Subject: ext4: print map->m_flags in trace_ext4_ext/ind_map_blocks_exit When we use trace_ext4_ext/ind_map_blocks_exit, print the value of map->m_flags in order that we can understand the extent's current status. 
Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 25914e3002c7..d2a125a6db8b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1519,10 +1519,9 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned int len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret), + TP_ARGS(inode, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1530,37 +1529,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) + __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pblk = pblk; - __entry->lblk = lblk; - __entry->len = len; + __entry->pblk = map->m_pblk; + __entry->lblk = map->m_lblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk, - __entry->len, __entry->ret) + __entry->len, __entry->flags, __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, 
ext4_ind_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, -- cgit v1.2.3 From 992e9fdd7b3f656ab8aea895f0038336950774ed Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:33 -0500 Subject: ext4: add some tracepoints in extent status tree This patch adds some tracepoints in extent status tree. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d2a125a6db8b..f6372b011366 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -15,6 +15,7 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct ext4_extent; +struct extent_status; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2054,6 +2055,106 @@ TRACE_EVENT(ext4_ext_remove_space_done, (unsigned short) __entry->eh_entries) ); +TRACE_EVENT(ext4_es_insert_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_remove_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + 
__field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_find_extent_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t start), + + TP_ARGS(inode, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + ), + + TP_printk("dev %d,%d ino %lu start %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->start) +); + +TRACE_EVENT(ext4_es_find_extent_exit, + TP_PROTO(struct inode *inode, struct extent_status *es, + ext4_lblk_t ret), + + TP_ARGS(inode, es, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, len ) + __field( ext4_lblk_t, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = es->start; + __entry->len = es->len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len, __entry->ret) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 1c7d66732458dc187008e3f5b2f71e019e320fc2 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Sat, 3 Nov 2012 12:38:33 +0800 Subject: tracing: Kill unused and puzzled sample code in ftrace.h When doing per-cpu helper optimizing work, find that this code is so puzzled. 1. It's mark as comment text, maybe a sample function for guidelines or a todo work. 2. 
But, this sample code is odd where struct perf_trace_buf is nonexistent. commit ce71b9 delete struct perf_trace_buf definition. Author: Frederic Weisbecker Date: Sun Nov 22 05:26:55 2009 +0100 tracing: Use the perf recursion protection from trace event Is it necessary to keep there? just compile test. Link: http://lkml.kernel.org/r/50949FC9.6050202@gmail.com Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 73 -------------------------------------------------- 1 file changed, 73 deletions(-) (limited to 'include/trace') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 698f2a890322..40dc5e8fe340 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -619,79 +619,6 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Define the insertion callback to perf events - * - * The job is very similar to ftrace_raw_event_ except that we don't - * insert in the ring buffer but in a perf counter. 
- * - * static void ftrace_perf_(proto) - * { - * struct ftrace_data_offsets_ __maybe_unused __data_offsets; - * struct ftrace_event_call *event_call = &event_; - * extern void perf_tp_event(int, u64, u64, void *, int); - * struct ftrace_raw_##call *entry; - * struct perf_trace_buf *trace_buf; - * u64 __addr = 0, __count = 1; - * unsigned long irq_flags; - * struct trace_entry *ent; - * int __entry_size; - * int __data_size; - * int __cpu - * int pc; - * - * pc = preempt_count(); - * - * __data_size = ftrace_get_offsets_(&__data_offsets, args); - * - * // Below we want to get the aligned size by taking into account - * // the u32 field that will later store the buffer size - * __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), - * sizeof(u64)); - * __entry_size -= sizeof(u32); - * - * // Protect the non nmi buffer - * // This also protects the rcu read side - * local_irq_save(irq_flags); - * __cpu = smp_processor_id(); - * - * if (in_nmi()) - * trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); - * else - * trace_buf = rcu_dereference_sched(perf_trace_buf); - * - * if (!trace_buf) - * goto end; - * - * trace_buf = per_cpu_ptr(trace_buf, __cpu); - * - * // Avoid recursion from perf that could mess up the buffer - * if (trace_buf->recursion++) - * goto end_recursion; - * - * raw_data = trace_buf->buf; - * - * // Make recursion update visible before entering perf_tp_event - * // so that we protect from perf recursions. 
- * - * barrier(); - * - * //zero dead bytes from alignment to avoid stack leak to userspace: - * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; - * entry = (struct ftrace_raw_ *)raw_data; - * ent = &entry->ent; - * tracing_generic_entry_update(ent, irq_flags, pc); - * ent->type = event_call->id; - * - * <- do some jobs with dynamic arrays - * - * <- affect our values - * - * perf_tp_event(event_call->id, __addr, __count, entry, - * __entry_size); <- submit them to perf counter - * - * } - */ #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From 3fbfbf7a3b66ec424042d909f14ba2ddf4372ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Aug 2012 21:35:53 -0700 Subject: rcu: Add callback-free CPUs RCU callback execution can add significant OS jitter and also can degrade both scheduling latency and, in asymmetric multiprocessors, energy efficiency. This commit therefore adds the ability for selected CPUs ("rcu_nocbs=" boot parameter) to have their callbacks offloaded to kthreads. If the "rcu_nocb_poll" boot parameter is also specified, these kthreads will do polling, removing the need for the offloaded CPUs to do wakeups. At least one CPU must be doing normal callback processing: currently CPU 0 cannot be selected as a no-CBs CPU. In addition, attempts to offline the last normal-CBs CPU will fail. This feature was inspired by Jim Houston's and Joe Korty's JRCU, and this commit includes fixes to problems located by Fengguang Wu's kbuild test robot. [ paulmck: Added gfp.h include file as suggested by Fengguang Wu. ] Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5bde94d8585b..d4f559b1ec34 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -549,6 +549,7 @@ TRACE_EVENT(rcu_torture_read, * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. * "Inc1": rcu_barrier_callback() piggyback check counter incremented. * "Offline": rcu_barrier_callback() found offline CPU + * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. -- cgit v1.2.3 From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:45 -0800 Subject: Revert "mm: remove __GFP_NO_KSWAPD" With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. 
If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there is a storm of THP requests that are simply rejected, it will still be the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loop, shrinking a small number of pages and calling shrink_slab() on each iteration. The temptation is to supply a patch that checks if kswapd was woken for THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not backed up by proper testing. As 3.7 is very close to release and this is not a bug we should release with, a safer path is to revert "mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing out the balance_pgdat() logic in general. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/gfpflags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 9391706e9254..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,6 +36,7 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ + {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" -- cgit v1.2.3 From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 Nov 2012 13:54:27 -0800 Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD"" It appears that this patch was innocent, and we hope that "mm: avoid waking kswapd for THP
allocations when compaction is deferred or contended" will fix the final kswapd-spinning cause. Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/gfpflags.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..9391706e9254 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,7 +36,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" -- cgit v1.2.3 From caf491916b1c1e939a2c7575efb7a77f11fc9bdf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Dec 2012 10:51:16 -0800 Subject: Revert "revert "Revert "mm: remove __GFP_NO_KSWAPD""" and associated damage This reverts commits a50915394f1fc02c2861d3b7ce7014788aa5066e and d7c3b937bdf45f0b844400b7bf6fd3ed50bac604. This is a revert of a revert of a revert. In addition, it reverts the even older i915 change to stop using the __GFP_NO_KSWAPD flag due to the original commits in linux-next. It turns out that the original patch really was bogus, and that the original revert was the correct thing to do after all. We thought we had fixed the problem, and then reverted the revert, but the problem really is fundamental: waking up kswapd simply isn't the right thing to do, and direct reclaim sometimes simply _is_ the right thing to do. When certain allocations fail, we simply should try some direct reclaim, and if that fails, fail the allocation. That's the right thing to do for THP allocations, which can easily fail, and the GPU allocations want to do that too. 
So starting kswapd is sometimes simply wrong, and removing the flag that said "don't start kswapd" was a mistake. Let's hope we never revisit this mistake again - and certainly not this many times ;) Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/gfpflags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 9391706e9254..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,6 +36,7 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ + {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" -- cgit v1.2.3 From 7b2a2d4a18fffac3c4872021529b0657896db788 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 14:07:31 +0100 Subject: mm: migrate: Add a tracepoint for migrate_pages The pgmigrate_success and pgmigrate_fail vmstat counters tells the user about migration activity but not the type or the reason. This patch adds a tracepoint to identify the type of page migration and why the page is being migrated. 
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/trace/events/migrate.h | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 include/trace/events/migrate.h (limited to 'include/trace') diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h new file mode 100644 index 000000000000..ec2a6ccfd7e5 --- /dev/null +++ b/include/trace/events/migrate.h @@ -0,0 +1,51 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM migrate + +#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MIGRATE_H + +#define MIGRATE_MODE \ + {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ + {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ + {MIGRATE_SYNC, "MIGRATE_SYNC"} + +#define MIGRATE_REASON \ + {MR_COMPACTION, "compaction"}, \ + {MR_MEMORY_FAILURE, "memory_failure"}, \ + {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ + {MR_SYSCALL, "syscall_or_cpuset"}, \ + {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ + {MR_CMA, "cma"} + +TRACE_EVENT(mm_migrate_pages, + + TP_PROTO(unsigned long succeeded, unsigned long failed, + enum migrate_mode mode, int reason), + + TP_ARGS(succeeded, failed, mode, reason), + + TP_STRUCT__entry( + __field( unsigned long, succeeded) + __field( unsigned long, failed) + __field( enum migrate_mode, mode) + __field( int, reason) + ), + + TP_fast_assign( + __entry->succeeded = succeeded; + __entry->failed = failed; + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", + __entry->succeeded, + __entry->failed, + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + +#endif /* _TRACE_MIGRATE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- cgit v1.2.3 From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type of oom_score_adj to short The maximum oom_score_adj is
1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/oom.h | 4 ++-- include/trace/events/task.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index dd4ba3b92002..1e974983757e 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -14,7 +14,7 @@ TRACE_EVENT(oom_score_adj_update, TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -23,7 +23,7 @@ TRACE_EVENT(oom_score_adj_update, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s oom_score_adj=%d", + TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); diff --git a/include/trace/events/task.h b/include/trace/events/task.h index b53add02e929..102a646e1996 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -15,7 +15,7 @@ TRACE_EVENT(task_newtask, __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) __field( unsigned long, clone_flags) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -25,7 +25,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); @@ -40,7 +40,7 @@ TRACE_EVENT(task_rename, 
__field( pid_t, pid) __array( char, oldcomm, TASK_COMM_LEN) __array( char, newcomm, TASK_COMM_LEN) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -50,7 +50,7 @@ TRACE_EVENT(task_rename, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd", __entry->pid, __entry->oldcomm, __entry->newcomm, __entry->oom_score_adj) ); -- cgit v1.2.3 From fb57dc817c24d46b035320d871b7a3fcc778558d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 30 Nov 2012 11:24:22 +0000 Subject: Btrfs: parse parent 0 into correct value in tracepoint Value 0 is not a tree id, so besides an upper limit, a lower limit is necessary as well while parsing root types of tracepoint. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- include/trace/events/btrfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 54fab041b22a..ea546a4e9609 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -45,7 +45,8 @@ struct extent_buffer; #define show_root_type(obj) \ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-" #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ -- cgit v1.2.3 From 7a64bf05b2a6fe3703062d13d389e3eb904741c6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:51 -0800 Subject: mm: add a __GFP_KMEMCG flag This flag is used to indicate to the callees that this allocation is a kernel allocation in process context, and should be accounted to current's memcg. 
Signed-off-by: Glauber Costa Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/gfpflags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..1eddbf1557f2 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ -- cgit v1.2.3 From 4520fb3c3690f2643006d85f09ecb74554c10e95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Dec 2012 13:28:54 -0500 Subject: ext4: split off ext4_journalled_invalidatepage() In data=journal mode we don't need delalloc or DIO handling in invalidatepage and similarly in other modes we don't need the journal handling. So split invalidatepage implementations. 
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index f6372b011366..7e8c36bc7082 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -451,7 +451,7 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage, TP_ARGS(page) ); -TRACE_EVENT(ext4_invalidatepage, +DECLARE_EVENT_CLASS(ext4_invalidatepage_op, TP_PROTO(struct page *page, unsigned long offset), TP_ARGS(page, offset), @@ -477,6 +477,18 @@ TRACE_EVENT(ext4_invalidatepage, (unsigned long) __entry->index, __entry->offset) ); +DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, + TP_PROTO(struct page *page, unsigned long offset), + + TP_ARGS(page, offset) +); + +DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, + TP_PROTO(struct page *page, unsigned long offset), + + TP_ARGS(page, offset) +); + TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), -- cgit v1.2.3