Diffstat (limited to 'kernel/trace')
-rw-r--r--   kernel/trace/Kconfig         |   1
-rw-r--r--   kernel/trace/ftrace.c        | 242
-rw-r--r--   kernel/trace/ring_buffer.c   | 585
-rw-r--r--   kernel/trace/trace.c         | 503
-rw-r--r--   kernel/trace/trace.h         |   4
-rw-r--r--   kernel/trace/trace_printk.c  |   4
6 files changed, 858 insertions, 481 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..d81a1a532994 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,6 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static int ftrace_cmp_recs(const void *a, const void *b) { - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) + if (key->flags < rec->ip) return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; return 0; } -/** - * ftrace_location - return true if the ip giving is a traced location - * @ip: the instruction pointer to check - * - * Returns 1 if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. - */ -int ftrace_location(unsigned long ip) +static unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; - key.ip = ip; + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); if (rec) - return 1; + return rec->ip; } return 0; } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip); +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. 
+ */ +int ftrace_text_reserved(void *start, void *end) +{ + unsigned long ret; + + ret = ftrace_location_range((unsigned long)start, + (unsigned long)end); + + return (int)!!ret; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ - if (ftrace_pages->index == ftrace_pages->size) { - /* We should have allocated enough */ - if (WARN_ON(!ftrace_pages->next)) - return NULL; - ftrace_pages = ftrace_pages->next; - } - - return &ftrace_pages->records[ftrace_pages->index++]; -} - -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) -{ - struct dyn_ftrace *rec; - - if (ftrace_disabled) - return NULL; - - rec = ftrace_alloc_dyn_node(ip); - if (!rec) - return NULL; - - rec->ip = ip; - - return rec; -} - static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) } } - -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip <= (unsigned long)end && - rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) - return 1; - } while_for_each_ftrace_rec(); - return 0; -} - static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; @@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int update) +void __weak ftrace_replace_code(int enable) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) return 0; } -static int __ftrace_modify_code(void *data) +void ftrace_modify_all_code(int command) { - int *command = data; - - if (*command & FTRACE_UPDATE_CALLS) + if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (*command & FTRACE_UPDATE_TRACE_FUNC) + if (command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); - if (*command & FTRACE_START_FUNC_RET) + if (command & FTRACE_START_FUNC_RET) ftrace_enable_ftrace_graph_caller(); - else if (*command & FTRACE_STOP_FUNC_RET) + else if (command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + ftrace_modify_all_code(*command); return 0; } @@ -2469,57 +2459,35 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if 
(iter) { + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static int ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static void ftrace_filter_reset(struct ftrace_hash *hash) @@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static void ftrace_swap_recs(void *a, void *b, int size) +static int ftrace_cmp_ips(const void *a, const void *b) +{ + const unsigned long *ipa = a; + const unsigned long *ipb = b; + + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size) { - struct dyn_ftrace *reca = a; - struct dyn_ftrace *recb = b; - struct dyn_ftrace t; + unsigned long *ipa = a; + unsigned long *ipb = b; + unsigned long t; - t = *reca; - *reca = *recb; - *recb = t; + t = *ipa; + *ipa = *ipb; + *ipb = t; } static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *start_pg; struct ftrace_page *pg; + struct dyn_ftrace *rec; unsigned long count; unsigned long *p; unsigned long addr; @@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - pg = ftrace_allocate_pages(count); - if (!pg) + sort(start, count, sizeof(*start), + ftrace_cmp_ips, ftrace_swap_ips); + + start_pg = ftrace_allocate_pages(count); + if (!start_pg) return -ENOMEM; mutex_lock(&ftrace_lock); @@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, if (!mod) { WARN_ON(ftrace_pages || ftrace_pages_start); /* First initialization */ - ftrace_pages = ftrace_pages_start = pg; + ftrace_pages = ftrace_pages_start = start_pg; } else { if (!ftrace_pages) goto out; @@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_pages = ftrace_pages->next; } - ftrace_pages->next = pg; - ftrace_pages = pg; + ftrace_pages->next = start_pg; } p = start; + pg = start_pg; while (p < end) { addr = ftrace_call_adjust(*p++); /* @@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - if (!ftrace_record_ip(addr)) - break; + + if (pg->index == pg->size) { + /* We should have allocated enough */ + if (WARN_ON(!pg->next)) + break; + pg = pg->next; + } + + rec = &pg->records[pg->index++]; + rec->ip = addr; } - /* These new locations need to be initialized */ - ftrace_new_pgs = pg; + /* We should have used all pages */ + WARN_ON(pg->next); + + /* Assign the last page to ftrace_pages */ + ftrace_pages = pg; - /* Make each individual set of pages sorted by ips */ - for (; pg; pg = pg->next) - sort(pg->records, pg->index, sizeof(struct dyn_ftrace), - ftrace_cmp_recs, ftrace_swap_recs); + /* These new locations need to be initialized */ + ftrace_new_pgs = start_pg; /* * We only need to disable interrupts on start up diff --git 
a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..6420cda62336 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -23,6 +23,8 @@ #include <asm/local.h> #include "trace.h" +static void update_pages_handler(struct work_struct *work); + /* * The ring buffer header is special. We must manually up keep it. */ @@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + unsigned int nr_pages; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; u64 write_stamp; u64 read_stamp; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ + int nr_pages_to_update; + struct list_head new_pages; /* new pages to add */ + struct work_struct update_pages_work; + struct completion update_done; }; struct ring_buffer { - unsigned pages; unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resize_disabled; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + /* Reset the head page if it exists */ + if (cpu_buffer->head_page) + rb_set_head_page(cpu_buffer); + rb_head_page_deactivate(cpu_buffer); if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) { + int i; struct buffer_page *bpage, *tmp; - LIST_HEAD(pages); - unsigned i; - - WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { struct page *page; @@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu_buffer->cpu)); + cpu_to_node(cpu)); if (!bpage) goto free_pages; - rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - list_add(&bpage->list, &pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; @@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } + return 0; + +free_pages: + list_for_each_entry_safe(bpage, tmp, pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + + return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + LIST_HEAD(pages); + + WARN_ON(!nr_pages); + + if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + return -ENOMEM; + /* * The ring buffer page list is a circular list that does not * start and end with a list head. 
All page list items point to @@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->pages = pages.next; list_del(&pages); + cpu_buffer->nr_pages = nr_pages; + rb_check_pages(cpu_buffer); return 0; - - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - return -ENOMEM; } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); + init_completion(&cpu_buffer->update_done); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - ret = rb_allocate_pages(cpu_buffer, buffer->pages); + ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; @@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, { struct ring_buffer *buffer; int bsize; - int cpu; + int cpu, nr_pages; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages < 2) - buffer->pages = 2; + if (nr_pages < 2) + nr_pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; } @@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +static inline unsigned long rb_page_entries(struct buffer_page *bpage) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ + struct list_head *tail_page, *to_remove, *next_page; + struct buffer_page *to_remove_page, *tmp_iter_page; + struct buffer_page *last_page, *first_page; + unsigned int nr_removed; + unsigned long head_bit; + int page_entries; + + head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + atomic_inc(&cpu_buffer->record_disabled); + /* + * We don't race with the 
readers since we have acquired the reader + * lock. We also don't race with writers after disabling recording. + * This makes it easy to figure out the first and the last page to be + * removed from the list. We unlink all the pages in between including + * the first and last pages. This is done in a busy loop so that we + * lose the least number of traces. + * The pages are freed after we restart recording and unlock readers. + */ + tail_page = &cpu_buffer->tail_page->list; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - p = cpu_buffer->pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - free_buffer_page(bpage); + /* + * tail page might be on reader page, we remove the next page + * from the ring buffer + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + tail_page = rb_list_head(tail_page->next); + to_remove = tail_page; + + /* start of pages to remove */ + first_page = list_entry(rb_list_head(to_remove->next), + struct buffer_page, list); + + for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { + to_remove = rb_list_head(to_remove)->next; + head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); + next_page = rb_list_head(to_remove)->next; -out: + /* + * Now we remove all pages between tail_page and next_page. + * Make sure that we have head_bit value preserved for the + * next page + */ + tail_page->next = (struct list_head *)((unsigned long)next_page | + head_bit); + next_page = rb_list_head(next_page); + next_page->prev = tail_page; + + /* make sure pages points to a valid page in the ring buffer */ + cpu_buffer->pages = next_page; + + /* update head page */ + if (head_bit) + cpu_buffer->head_page = list_entry(next_page, + struct buffer_page, list); + + /* + * change read pointer to make sure any read iterators reset + * themselves + */ + cpu_buffer->read = 0; + + /* pages are removed, resume tracing and then free the pages */ + atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + + /* last buffer page to remove */ + last_page = list_entry(rb_list_head(to_remove), struct buffer_page, + list); + tmp_iter_page = first_page; + + do { + to_remove_page = tmp_iter_page; + rb_inc_page(cpu_buffer, &tmp_iter_page); + + /* update the counters */ + page_entries = rb_page_entries(to_remove_page); + if (page_entries) { + /* + * If something was added to this page, it was full + * since it is not the tail page. So we deduct the + * bytes consumed in ring buffer from here. + * No need to update overruns, since this page is + * deleted from ring buffer and its entries are + * already accounted for. 
+ */ + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + } + + /* + * We have already removed references to this list item, just + * free up the buffer_page and its page + */ + free_buffer_page(to_remove_page); + nr_removed--; + + } while (to_remove_page != last_page); + + RB_WARN_ON(cpu_buffer, nr_removed); + + return nr_removed == 0; } -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *pages, unsigned nr_pages) +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + struct list_head *pages = &cpu_buffer->new_pages; + int retries, success; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + /* + * We are holding the reader lock, so the reader page won't be swapped + * in the ring buffer. Now we are racing with the writer trying to + * move head page and the tail page. + * We are going to adapt the reader page update process where: + * 1. We first splice the start and end of list of new pages between + * the head page and its previous page. + * 2. We cmpxchg the prev_page->next to point from head page to the + * start of new pages list. + * 3. Finally, we update the head->prev to the end of new list. + * + * We will try this process 10 times, to make sure that we don't keep + * spinning. + */ + retries = 10; + success = 0; + while (retries--) { + struct list_head *head_page, *prev_page, *r; + struct list_head *last_page, *first_page; + struct list_head *head_page_with_bit; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - goto out; - p = pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - list_add_tail(&bpage->list, cpu_buffer->pages); + head_page = &rb_set_head_page(cpu_buffer)->list; + prev_page = head_page->prev; + + first_page = pages->next; + last_page = pages->prev; + + head_page_with_bit = (struct list_head *) + ((unsigned long)head_page | RB_PAGE_HEAD); + + last_page->next = head_page_with_bit; + first_page->prev = prev_page; + + r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + + if (r == head_page_with_bit) { + /* + * yay, we replaced the page pointer to our new list, + * now, we just have to update to head page's prev + * pointer to point to end of list + */ + head_page->prev = last_page; + success = 1; + break; + } } - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); -out: + if (success) + INIT_LIST_HEAD(pages); + /* + * If we weren't successful in adding in new pages, warn and stop + * tracing + */ + RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + /* free pages if they weren't inserted */ + if (!success) { + struct buffer_page *bpage, *tmp; + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + int success; + + if (cpu_buffer->nr_pages_to_update > 0) + success = rb_insert_pages(cpu_buffer); + else + success = rb_remove_pages(cpu_buffer, + -cpu_buffer->nr_pages_to_update); + + if (success) + cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ + struct ring_buffer_per_cpu *cpu_buffer = container_of(work, + struct ring_buffer_per_cpu, update_pages_work); + rb_update_pages(cpu_buffer); + complete(&cpu_buffer->update_done); } /** @@ 
-1283,16 +1471,14 @@ out: * * Minimum size is 2 * BUF_PAGE_SIZE. * - * Returns -1 on failure. + * Returns 0 on success and < 0 on failure. */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, + int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *bpage, *tmp; - unsigned long buffer_size; - LIST_HEAD(pages); - int i, cpu; + unsigned nr_pages; + int cpu, err = 0; /* * Always succeed at resizing a non-existent buffer: @@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; - buffer_size = buffer->pages * BUF_PAGE_SIZE; /* we need a minimum of two pages */ if (size < BUF_PAGE_SIZE * 2) size = BUF_PAGE_SIZE * 2; - if (size == buffer_size) - return size; - - atomic_inc(&buffer->record_disabled); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - /* Make sure all writers are done with this buffer. */ - synchronize_sched(); + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&buffer->resize_disabled)) + return -EBUSY; + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - get_online_cpus(); - - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - if (size < buffer_size) { + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* calculate the pages to update */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) - goto out_fail; + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + /* + * nothing more to do for removing pages or no update + */ + if (cpu_buffer->nr_pages_to_update <= 0) + continue; + /* + * to add pages, make sure all new pages can be + * allocated without receiving ENOMEM + */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto out_err; + } + } - rm_pages = buffer->pages - nr_pages; + get_online_cpus(); + /* + * Fire off all the required work handlers + * We can't schedule on offline CPUs, but it's not necessary + * since we can change their buffer sizes without any race. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + else + rb_update_pages(cpu_buffer); + } + /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - rb_remove_pages(cpu_buffer, rm_pages); + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + wait_for_completion(&cpu_buffer->update_done); + cpu_buffer->nr_pages_to_update = 0; } - goto out; - } - /* - * This is a bit more difficult. We only want to add pages - * when we can allocate enough for all CPUs. We do this - * by allocating all the pages and storing them on a local - * link list. If we succeed in our allocation, then we - * add these pages to the cpu_buffers. 
Otherwise we just free - * them all and return -ENOMEM; - */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) - goto out_fail; + put_online_cpus(); + } else { + cpu_buffer = buffer->buffers[cpu_id]; - new_pages = nr_pages - buffer->pages; + if (nr_pages == cpu_buffer->nr_pages) + goto out; - for_each_buffer_cpu(buffer, cpu) { - for (i = 0; i < new_pages; i++) { - struct page *page; - /* - * __GFP_NORETRY flag makes sure that the allocation - * fails gracefully without invoking oom-killer and - * the system is not destabilized. - */ - bpage = kzalloc_node(ALIGN(sizeof(*bpage), - cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu)); - if (!bpage) - goto free_pages; - list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (cpu_buffer->nr_pages_to_update > 0 && + __rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu_id)) { + err = -ENOMEM; + goto out_err; } - } - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - rb_insert_pages(cpu_buffer, &pages, new_pages); - } + get_online_cpus(); - if (RB_WARN_ON(buffer, !list_empty(&pages))) - goto out_fail; + if (cpu_online(cpu_id)) { + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } else + rb_update_pages(cpu_buffer); + + cpu_buffer->nr_pages_to_update = 0; + put_online_cpus(); + } out: - buffer->pages = nr_pages; - put_online_cpus(); + /* + * The ring buffer resize can happen with the ring buffer + * enabled, so that the update disturbs the tracing as little + * as possible. But if the buffer is disabled, we do not need + * to worry about that, and we can take the time to verify + * that the buffer is not corrupt. + */ + if (atomic_read(&buffer->record_disabled)) { + atomic_inc(&buffer->record_disabled); + /* + * Even though the buffer was disabled, we must make sure + * that it is truly disabled before calling rb_check_pages. + * There could have been a race between checking + * record_disable and incrementing it. + */ + synchronize_sched(); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_check_pages(cpu_buffer); + } + atomic_dec(&buffer->record_disabled); + } + mutex_unlock(&buffer->mutex); + return size; - atomic_dec(&buffer->record_disabled); + out_err: + for_each_buffer_cpu(buffer, cpu) { + struct buffer_page *bpage, *tmp; - return size; + cpu_buffer = buffer->buffers[cpu]; + cpu_buffer->nr_pages_to_update = 0; - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - put_online_cpus(); - mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -ENOMEM; + if (list_empty(&cpu_buffer->new_pages)) + continue; - /* - * Something went totally wrong, and we are too paranoid - * to even clean up the mess. 
- */ - out_fail: - put_online_cpus(); + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -1; + return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ - return local_read(&bpage->write) & RB_WRITE_MASK; -} - static inline unsigned rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ - return local_read(&bpage->entries) & RB_WRITE_MASK; -} - /* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: - max_count = cpu_buffer->buffer->pages * 100; + max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != cpu_buffer->tail_page) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) @@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); return iter; @@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + /* + * Ring buffer is disabled from recording, here's a good place + * to check the integrity of the ring buffer. + */ + rb_check_pages(cpu_buffer); + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->buffer->resize_disabled); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); @@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) { - return BUF_PAGE_SIZE * buffer->pages; + /* + * Earlier, this method returned + * BUF_PAGE_SIZE * buffer->nr_pages + * Since the nr_pages field is now removed, we have converted this to + * return the per cpu buffer value. 
+ */ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); + /* Make sure all commits have finished */ + synchronize_sched(); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) @@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&buffer->resize_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->pages != buffer_b->pages) + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; ret = -EAGAIN; @@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (atomic_read(&buffer_b->record_disabled)) goto out; - cpu_buffer_a = buffer_a->buffers[cpu]; - cpu_buffer_b = buffer_b->buffers[cpu]; - if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; @@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; + int cpu_i, nr_pages_same; + unsigned int nr_pages; switch (action) { case CPU_UP_PREPARE: @@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self, if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; + } + } + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %ld\n", cpu); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..68032c6177db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,18 +87,6 @@ static int tracing_disabled = 1; DEFINE_PER_CPU(int, ftrace_cpu_disabled); -static inline void ftrace_disable_cpu(void) -{ - preempt_disable(); - __this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ - __this_cpu_dec(ftrace_cpu_disabled); - preempt_enable(); -} - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) static ssize_t 
trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - void *ret; if (s->len <= s->readpos) return -EBUSY; @@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) len = s->len - s->readpos; if (cnt > len) cnt = len; - ret = memcpy(buf, s->buffer + s->readpos, cnt); - if (!ret) - return -EFAULT; + memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; @@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - ftrace_disable_cpu(); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); if (ret == -EBUSY) { @@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) "Failed to swap buffers due to commit in progress\n"); } - ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); @@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * Register a new plugin tracer. */ int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock) { struct tracer *t; int ret = 0; @@ -841,7 +820,8 @@ __acquires(kernel_lock) /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size); + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -857,7 +837,8 @@ __acquires(kernel_lock) /* Shrink the max buffer again */ if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1); + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); printk(KERN_CONT "PASSED\n"); } @@ -917,13 +898,6 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ - ftrace_disable_cpu(); - ring_buffer_reset_cpu(buffer, cpu); - ftrace_enable_cpu(); -} - void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; @@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + char buffer[TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct *trace_percpu_sirq_buffer; +static struct trace_buffer_struct *trace_percpu_irq_buffer; +static struct trace_buffer_struct *trace_percpu_nmi_buffer; + +/* + * The buffer used is dependent on the context. There is a per cpu + * buffer for normal context, softirq contex, hard irq context and + * for NMI context. Thise allows for lockless recording. 
+ * + * Note, if the buffers failed to be allocated, then this returns NULL + */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *percpu_buffer; + struct trace_buffer_struct *buffer; + + /* + * If we have allocated per cpu buffers, then we do not + * need to do any locking. + */ + if (in_nmi()) + percpu_buffer = trace_percpu_nmi_buffer; + else if (in_irq()) + percpu_buffer = trace_percpu_irq_buffer; + else if (in_softirq()) + percpu_buffer = trace_percpu_sirq_buffer; + else + percpu_buffer = trace_percpu_buffer; + + if (!percpu_buffer) + return NULL; + + buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); + + return buffer->buffer; +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct *buffers; + struct trace_buffer_struct *sirq_buffers; + struct trace_buffer_struct *irq_buffers; + struct trace_buffer_struct *nmi_buffers; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (!buffers) + goto err_warn; + + sirq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!sirq_buffers) + goto err_sirq; + + irq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!irq_buffers) + goto err_irq; + + nmi_buffers = alloc_percpu(struct trace_buffer_struct); + if (!nmi_buffers) + goto err_nmi; + + trace_percpu_buffer = buffers; + trace_percpu_sirq_buffer = sirq_buffers; + trace_percpu_irq_buffer = irq_buffers; + trace_percpu_nmi_buffer = nmi_buffers; + + return 0; + + err_nmi: + free_percpu(irq_buffers); + err_irq: + free_percpu(sirq_buffers); + err_sirq: + free_percpu(buffers); + err_warn: + WARN(1, "Could not allocate percpu trace_printk buffer"); + return -ENOMEM; +} + +void trace_printk_init_buffers(void) +{ + static int buffers_allocated; + + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + pr_info("ftrace: Allocated trace_printk buffers\n"); + + buffers_allocated = 1; +} + /** * trace_vbprintk - write binary msg to tracing buffer * */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - static u32 trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; - int disable; - int cpu, len = 0, size, pc; + char *tbuffer; + int len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - /* Lockdep uses trace_printk for lock tracing */ - local_irq_save(flags); - arch_spin_lock(&trace_buf_lock); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out; + local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; - 
memcpy(entry->buf, trace_buf, sizeof(u32) * len); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } -out_unlock: - arch_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); unpause_graph_tracing(); @@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; + int len = 0, size, pc; struct print_entry *entry; - unsigned long irq_flags; - int disable; + unsigned long flags; + char *tbuffer; if (tracing_disabled || tracing_selftest_running) return 0; + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - arch_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + if (len > TRACE_BUF_SIZE) + goto out; + local_save_flags(flags); size = sizeof(*entry) + len + 1; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, pc); + flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, trace_buf, len); + memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 6, pc); + ftrace_trace_stack(buffer, flags, 6, pc); } - - out_unlock: - arch_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); + unpause_graph_tracing(); return len; } @@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - iter->idx++; if (iter->buffer_iter[iter->cpu]) ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - - ftrace_enable_cpu(); } static struct trace_entry * @@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else event = ring_buffer_peek(iter->tr->buffer, cpu, ts, lost_events); - ftrace_enable_cpu(); - if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); @@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); 
ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, &iter->lost_events); - ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - ftrace_disable_cpu(); - if (cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); - ftrace_enable_cpu(); - iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -2332,15 +2371,13 @@ static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file) { long cpu_file = (long) inode->i_private; - void *fail_ret = ERR_PTR(-ENOMEM); struct trace_iterator *iter; - struct seq_file *m; - int cpu, ret; + int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); @@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) tracing_iter_reset(iter, cpu); } - ret = seq_open(file, &tracer_seq_ops); - if (ret < 0) { - fail_ret = ERR_PTR(ret); - goto fail_buffer; - } - - m = file->private_data; - m->private = iter; - mutex_unlock(&trace_types_lock); return iter; - fail_buffer: - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - free_cpumask_var(iter->started); - tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); - kfree(iter); - - return fail_ret; + seq_release_private(inode, file); + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) tracing_start(); mutex_unlock(&trace_types_lock); - seq_release(inode, file); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); - kfree(iter); + seq_release_private(inode, file); return 0; } @@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); + ring_buffer_record_disable_cpu(global_trace.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); + ring_buffer_record_enable_cpu(global_trace.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int __tracing_resize_ring_buffer(unsigned long size) +static void set_buffer_entries(struct trace_array *tr, unsigned long val) +{ + int cpu; + for_each_tracing_cpu(cpu) + tr->data[cpu]->entries = val; +} + +static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) */ ring_buffer_expanded = 1; - ret = ring_buffer_resize(global_trace.buffer, size); + ret = ring_buffer_resize(global_trace.buffer, size, cpu); if (ret < 0) return ret; if (!current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size); + ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r; + int r = 0; + + if (cpu == RING_BUFFER_ALL_CPUS) { + int i; + for_each_tracing_cpu(i) { + r = ring_buffer_resize(global_trace.buffer, + 
global_trace.data[i]->entries, + i); + if (r < 0) + break; + } + } else { + r = ring_buffer_resize(global_trace.buffer, + global_trace.data[cpu]->entries, + cpu); + } - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); if (r < 0) { /* * AARGH! We are left with different @@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) return ret; } - max_tr.entries = size; + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&max_tr, size); + else + max_tr.data[cpu]->entries = size; + out: - global_trace.entries = size; + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&global_trace, size); + else + global_trace.data[cpu]->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size) +static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) { - int cpu, ret = size; + int ret = size; mutex_lock(&trace_types_lock); - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); + if (cpu_id != RING_BUFFER_ALL_CPUS) { + /* make sure, this cpu is enabled in the mask */ + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { + ret = -EINVAL; + goto out; + } } - if (size != global_trace.entries) - ret = __tracing_resize_ring_buffer(size); - + ret = __tracing_resize_ring_buffer(size, cpu_id); if (ret < 0) ret = -ENOMEM; - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } - - tracing_start(); +out: mutex_unlock(&trace_types_lock); return ret; @@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size, + RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); return ret; @@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size, + RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; ret = 0; @@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. 
*/ - ring_buffer_resize(max_tr.buffer, 1); - max_tr.entries = 1; + ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); } destroy_trace_option_files(topts); @@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(current_trace); if (current_trace->use_max_tr) { - ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); - if (ret < 0) - goto out; - max_tr.entries = global_trace.entries; + int cpu; + /* we need to make per cpu buffer sizes equivalent */ + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(max_tr.buffer, + global_trace.data[cpu]->entries, + cpu); + if (ret < 0) + goto out; + max_tr.data[cpu]->entries = + global_trace.data[cpu]->entries; + } } if (t->init) { @@ -3642,30 +3688,82 @@ out_err: goto out; } +struct ftrace_entries_info { + struct trace_array *tr; + int cpu; +}; + +static int tracing_entries_open(struct inode *inode, struct file *filp) +{ + struct ftrace_entries_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = (unsigned long)inode->i_private; + + filp->private_data = info; + + return 0; +} + static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; - char buf[96]; - int r; + struct ftrace_entries_info *info = filp->private_data; + struct trace_array *tr = info->tr; + char buf[64]; + int r = 0; + ssize_t ret; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - r = sprintf(buf, "%lu (expanded: %lu)\n", - tr->entries >> 10, - trace_buf_size >> 10); - else - r = sprintf(buf, "%lu\n", tr->entries >> 10); + + if (info->cpu == RING_BUFFER_ALL_CPUS) { + int cpu, buf_size_same; + unsigned long size; + + size = 0; + buf_size_same = 1; + /* check if all cpu sizes are same */ + for_each_tracing_cpu(cpu) { + /* fill in the size from first enabled cpu */ + if (size == 0) + size = tr->data[cpu]->entries; + if (size != tr->data[cpu]->entries) { + buf_size_same = 0; + break; + } + } + + if (buf_size_same) { + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + size >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", size >> 10); + } else + r = sprintf(buf, "X\n"); + } else + r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); + mutex_unlock(&trace_types_lock); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct ftrace_entries_info *info = filp->private_data; unsigned long val; int ret; @@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - ret = tracing_resize_ring_buffer(val); + ret = tracing_resize_ring_buffer(val, info->cpu); if (ret < 0) return ret; @@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int +tracing_entries_release(struct inode *inode, struct file *filp) +{ + struct ftrace_entries_info *info = filp->private_data; + + kfree(info); + + return 0; +} + static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); 
for_each_tracing_cpu(cpu) { - size += tr->entries >> 10; + size += tr->data[cpu]->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) if (trace_flags & TRACE_ITER_STOP_ON_FREE) tracing_off(); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0); + tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); return 0; } @@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct print_entry *entry; unsigned long irq_flags; struct page *pages[2]; + void *map_page[2]; int nr_pages = 1; ssize_t written; - void *page1; - void *page2; int offset; int size; int len; int ret; + int i; if (tracing_disabled) return -EINVAL; @@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, goto out; } - page1 = kmap_atomic(pages[0]); - if (nr_pages == 2) - page2 = kmap_atomic(pages[1]); + for (i = 0; i < nr_pages; i++) + map_page[i] = kmap_atomic(pages[i]); local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ @@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (nr_pages == 2) { len = PAGE_SIZE - offset; - memcpy(&entry->buf, page1 + offset, len); - memcpy(&entry->buf[len], page2, cnt - len); + memcpy(&entry->buf, map_page[0] + offset, len); + memcpy(&entry->buf[len], map_page[1], cnt - len); } else - memcpy(&entry->buf, page1 + offset, cnt); + memcpy(&entry->buf, map_page[0] + offset, cnt); if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - if (nr_pages == 2) - kunmap_atomic(page2); - kunmap_atomic(page1); - while (nr_pages > 0) - put_page(pages[--nr_pages]); + for (i = 0; i < nr_pages; i++){ + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } out: return written; } @@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_entries_open, .read = tracing_entries_read, .write = tracing_entries_write, + .release = tracing_entries_release, .llseek = generic_file_llseek, }; @@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ + if (!d_percpu) + return; + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { @@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("stats", 0444, d_cpu, (void *) cpu, &tracing_stats_fops); + + trace_create_file("buffer_size_kb", 0444, d_cpu, + (void *) cpu, &tracing_entries_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void) (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); + (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, &global_trace, &tracing_total_entries_fops); @@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; + /* Only allocate trace_printk buffers if a trace_printk exists */ + if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + trace_printk_init_buffers(); + /* 
To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void) WARN_ON(1); goto out_free_cpumask; } - global_trace.entries = ring_buffer_size(global_trace.buffer); if (global_trace.buffer_disabled) tracing_off(); @@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void) ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void) max_tr.data[i] = &per_cpu(max_tr_data, i); } + set_buffer_entries(&global_trace, + ring_buffer_size(global_trace.buffer, 0)); +#ifdef CONFIG_TRACER_MAX_TRACE + set_buffer_entries(&max_tr, 1); +#endif + trace_init_cmdlines(); register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..6c6f7933eede 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -131,6 +131,7 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ + unsigned long entries; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -152,7 +153,6 @@ struct trace_array_cpu { */ struct trace_array { struct ring_buffer *buffer; - unsigned long entries; int cpu; int buffer_disabled; cycle_t time_start; @@ -826,6 +826,8 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +void trace_printk_init_buffers(void); + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) const char **iter; char *fmt; + /* allocate the trace_printk per cpu buffers */ + if (start != end) + trace_printk_init_buffers(); + mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
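For reference, a minimal userspace sketch of the range-keyed bsearch() comparator that the ftrace.c hunk above introduces for ftrace_location_range(): the search key carries a [start, end] range (with end overloaded into .flags), and a record matches when its call-site instruction overlaps that range. This is not the kernel code itself; MCOUNT_INSN_SIZE is assumed to be 5 bytes purely for the demo, and the table is a hand-built stand-in for the sorted ftrace pages.

/* Standalone illustration (userspace C) of the range-keyed bsearch comparator. */
#include <stdio.h>
#include <stdlib.h>

#define MCOUNT_INSN_SIZE 5	/* demo value; arch-dependent in the kernel */

struct dyn_ftrace {
	unsigned long ip;	/* address of the mcount/fentry call site */
	unsigned long flags;	/* overloaded as "end" in the search key */
};

/* a == search key (start in .ip, end in .flags), b == table record */
static int ftrace_cmp_recs(const void *a, const void *b)
{
	const struct dyn_ftrace *key = a;
	const struct dyn_ftrace *rec = b;

	if (key->flags < rec->ip)			/* range ends before rec */
		return -1;
	if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)	/* range starts after rec */
		return 1;
	return 0;					/* range overlaps rec: match */
}

/* return the matching rec->ip, or 0 if no call site lies in [start, end] */
static unsigned long location_range(struct dyn_ftrace *recs, size_t nr,
				    unsigned long start, unsigned long end)
{
	struct dyn_ftrace key = { .ip = start, .flags = end };
	struct dyn_ftrace *rec;

	rec = bsearch(&key, recs, nr, sizeof(*recs), ftrace_cmp_recs);
	return rec ? rec->ip : 0;
}

int main(void)
{
	/* records must be sorted by ip, just as the patch now sorts the
	 * mcount locations up front with ftrace_cmp_ips/ftrace_swap_ips */
	struct dyn_ftrace recs[] = {
		{ .ip = 0x1000 }, { .ip = 0x1020 }, { .ip = 0x1040 },
	};

	printf("%#lx\n", location_range(recs, 3, 0x1022, 0x1022)); /* 0x1020 */
	printf("%#lx\n", location_range(recs, 3, 0x1030, 0x103f)); /* 0 */
	return 0;
}

With this shape, ftrace_location(ip) is just the degenerate range query location_range(ip, ip), and ftrace_text_reserved() becomes a non-zero test on the same helper, which is exactly how the patch collapses the old linear scan into one binary search per page.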