diff options
-rw-r--r-- | Documentation/trace/tracedump.txt | 58 | ||||
-rw-r--r-- | include/linux/tracedump.h | 43 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 24 | ||||
-rw-r--r-- | kernel/trace/Makefile | 1 | ||||
-rw-r--r-- | kernel/trace/tracedump.c | 682 |
5 files changed, 808 insertions, 0 deletions
diff --git a/Documentation/trace/tracedump.txt b/Documentation/trace/tracedump.txt new file mode 100644 index 000000000000..cba0decc3fc3 --- /dev/null +++ b/Documentation/trace/tracedump.txt @@ -0,0 +1,58 @@ + Tracedump + + Documentation written by Alon Farchy + +1. Overview +============ + +The tracedump module provides additional mechanisms to retrieve tracing data. +It can be used to retrieve traces after a kernel panic or while the system +is running in either binary format or plaintext. The dumped data is compressed +with zlib to conserve space. + +2. Configuration Options +======================== + +CONFIG_TRACEDUMP - enable the tracedump module. +CONFIG_TRACEDUMP_PANIC - dump to console on kernel panic +CONFIG_TRACEDUMP_PROCFS - add file /proc/tracedump for userspace access. + +3. Module Parameters +==================== + +format_ascii + + If 1, data will dump in human-readable format, ordered by time. + If 0, data will be dumped as raw pages from the ring buffer, + ordered by CPU, followed by the saved cmdlines so that the + raw data can be decoded. Default: 0 + +panic_size + + Maximum amount of compressed data to dump during a kernel panic + in bytes. This only applies if format_ascii == 1. In this case, + tracedump will compress the data, check the size, and if it is too big + toss out some data, compress again, etc, until the size is below + panic_size. Default: 524288 (512KB) + +compress_level + + Determines the compression level that zlib will use. Available levels + are 0-9, with 0 as no compression and 9 as maximum compression. + Default: 9. + +4. Usage +======== + +If configured with CONFIG_TRACEDUMP_PROCFS, the tracing data can be pulled +by reading from /proc/tracedump. For example: + + # cat /proc/tracedump > my_tracedump + +Tracedump will surround the dump with a magic word (TRACEDUMP). Between the +magic words is the compressed data, which can be decompressed with a standard +zlib implementation. 
After decompression, if format_ascii == 1, then the +output should be readable. + +If format_ascii == 0, the output should be in binary form, delimited by +CPU_END. After the last CPU come the saved cmdlines, each terminated by |. diff --git a/include/linux/tracedump.h b/include/linux/tracedump.h new file mode 100644 index 000000000000..9e86946e354e --- /dev/null +++ b/include/linux/tracedump.h @@ -0,0 +1,43 @@ +/* + * include/linux/tracedump.h + * + * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef _LINUX_KERNEL_TRACEDUMP_H +#define _LINUX_KERNEL_TRACEDUMP_H + +/* tracedump + * This module provides additional mechanisms for retrieving tracing data. + * For details on configurations, parameters and usage, see tracedump.txt. 
+ */ + +#define TD_NO_PRINT 0 +#define TD_PRINT_CONSOLE 1 +#define TD_PRINT_USER 2 + +/* Dump the tracer to console */ +int tracedump_dump(size_t max_out); + +/* Dumping functions */ +int tracedump_init(void); +ssize_t tracedump_all(int print_to); +ssize_t tracedump_next(size_t max_out, int print_to); +int tracedump_reset(void); +int tracedump_deinit(void); + +#endif /* _LINUX_KERNEL_TRACEDUMP_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1c8ed74dcaf0..fd101e6f0190 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -582,6 +582,30 @@ config TRACELEVEL will automatically be enabled on kernel boot, and users can change the the trace level in a kernel parameter. +config TRACEDUMP + bool "Dumping functionality for ftrace" + depends on FUNCTION_TRACER + help + This option adds functionality to dump tracing data in several forms. + Data can be dumped in ascii form or as raw pages from the tracing + ring buffers, along with the saved cmdlines. This is specified by + the module parameter format_ascii. Data will be compressed + using zlib. + +config TRACEDUMP_PANIC + bool "Tracedump to console on panic" + depends on TRACEDUMP + help + With this option, tracedump will automatically dump to the console + on a kernel panic. + +config TRACEDUMP_PROCFS + bool "Tracedump via proc file" + depends on TRACEDUMP + help + With this option, tracedump can be dumped from user space by reading + from /proc/tracedump. 
+ endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 45ef52cfb0b9..2ac813c96712 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -59,6 +59,7 @@ ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_TRACELEVEL) += tracelevel.o +obj-$(CONFIG_TRACEDUMP) += tracedump.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c new file mode 100644 index 000000000000..8d9589faad82 --- /dev/null +++ b/kernel/trace/tracedump.c @@ -0,0 +1,682 @@ +/* + * kernel/trace/tracedump.c + * + * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include <linux/console.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/ring_buffer.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/threads.h> +#include <linux/tracedump.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#include "trace.h" +#include "trace_output.h" + +#define CPU_MAX (NR_CPUS-1) + +#define TRYM(fn, ...) do { \ + int try_error = (fn); \ + if (try_error < 0) { \ + printk(__VA_ARGS__); \ + return try_error; \ + } \ +} while (0) + +#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__) + +/* Stolen from printk.c */ +#define for_each_console(con) \ + for (con = console_drivers; con != NULL; con = con->next) + +#define TAG KERN_ERR "tracedump: " + +#define TD_MIN_CONSUME 2000 +#define TD_COMPRESS_CHUNK 0x8000 + +static DEFINE_MUTEX(tracedump_proc_lock); + +static const char MAGIC_NUMBER[9] = "TRACEDUMP"; +static const char CPU_DELIM[7] = "CPU_END"; +#define CMDLINE_DELIM "|" + +/* Type of output */ +static bool current_format; +static bool format_ascii; +module_param(format_ascii, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data"); + +/* Max size of output */ +static uint panic_size = 0x80000; +module_param(panic_size, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)"); + +static uint compress_level = 9; +module_param(compress_level, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(compress_level, "Level of compression to use. 
[0-9]"); + +static char out_buf[TD_COMPRESS_CHUNK]; +static z_stream stream; +static int compress_done; +static int flush; + +static int old_trace_flags; + +static struct trace_iterator iter; +static struct pager_s { + struct trace_array *tr; + void *spare; + int cpu; + int len; + char __user *ubuf; +} pager; + +static char cmdline_buf[16+TASK_COMM_LEN]; + +static int print_to_console(const char *buf, size_t len) +{ + struct console *con; + + /* Stolen from printk.c */ + for_each_console(con) { + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) + con->write(con, buf, len); + } + return 0; +} + +static int print_to_user(const char *buf, size_t len) +{ + int size; + size = copy_to_user(pager.ubuf, buf, len); + if (size > 0) { + printk(TAG "Failed to copy to user %d bytes\n", size); + return -EINVAL; + } + return 0; +} + +static int print(const char *buf, size_t len, int print_to) +{ + if (print_to == TD_PRINT_CONSOLE) + TRY(print_to_console(buf, len)); + else if (print_to == TD_PRINT_USER) + TRY(print_to_user(buf, len)); + return 0; +} + +/* print_magic will print MAGIC_NUMBER using the + * print function selected by print_to. + */ +static inline ssize_t print_magic(int print_to) +{ + print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to); + return sizeof(MAGIC_NUMBER); +} + +static int iter_init(void) +{ + int cpu; + + /* Make iter point to global ring buffer used in trace. */ + trace_init_global_iter(&iter); + + /* Disable tracing */ + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + /* Save flags */ + old_trace_flags = trace_flags; + + /* Dont look at memory in panic mode. 
*/ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + /* Prepare ring buffer iter */ + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + return 0; +} + +/* iter_next gets the next entry in the ring buffer, ordered by time. + * If there are no more entries, returns 0. + */ +static ssize_t iter_next(void) +{ + /* Zero out the iterator's seq */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + + while (!trace_empty(&iter)) { + if (trace_find_next_entry_inc(&iter) == NULL) { + printk(TAG "trace_find_next_entry failed!\n"); + return -EINVAL; + } + + /* Copy the ring buffer data to iterator's seq */ + print_trace_line(&iter); + if (iter.seq.len != 0) + return iter.seq.len; + } + return 0; +} + +static int iter_deinit(void) +{ + int cpu; + /* Enable tracing */ + for_each_tracing_cpu(cpu) { + ring_buffer_read_finish(iter.buffer_iter[cpu]); + } + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + /* Restore flags */ + trace_flags = old_trace_flags; + return 0; +} + +static int pager_init(void) +{ + int cpu; + + /* Need to do this to get a pointer to global_trace (iter.tr). + Lame, I know. */ + trace_init_global_iter(&iter); + + /* Turn off tracing */ + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + memset(&pager, 0, sizeof(pager)); + pager.tr = iter.tr; + pager.len = TD_COMPRESS_CHUNK; + + return 0; +} + +/* pager_next_cpu moves the pager to the next cpu. + * Returns 0 if pager is done, else 1. + */ +static ssize_t pager_next_cpu(void) +{ + if (pager.cpu <= CPU_MAX) { + pager.cpu += 1; + return 1; + } + + return 0; +} + +/* pager_next gets the next page of data from the ring buffer + * of the current cpu. Returns page size or 0 if no more data. 
+ */ +static ssize_t pager_next(void) +{ + int ret; + + if (pager.cpu > CPU_MAX) + return 0; + + if (!pager.spare) + pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer); + if (!pager.spare) { + printk(TAG "ring_buffer_alloc_read_page failed!"); + return -ENOMEM; + } + + ret = ring_buffer_read_page(pager.tr->buffer, + &pager.spare, + pager.len, + pager.cpu, 0); + if (ret < 0) + return 0; + + return PAGE_SIZE; +} + +static int pager_deinit(void) +{ + int cpu; + if (pager.spare != NULL) + ring_buffer_free_read_page(pager.tr->buffer, pager.spare); + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + return 0; +} + +/* cmdline_next gets the next saved cmdline from the trace and + * puts it in cmdline_buf. Returns the size of the cmdline, or 0 if empty. + * but will reset itself on a subsequent call. + */ +static ssize_t cmdline_next(void) +{ + static int pid; + ssize_t size = 0; + + if (pid >= PID_MAX_DEFAULT) + pid = -1; + + while (size == 0 && pid < PID_MAX_DEFAULT) { + pid++; + trace_find_cmdline(pid, cmdline_buf); + if (!strncmp(cmdline_buf, "<...>", 5)) + continue; + + sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d" + CMDLINE_DELIM, pid); + size = strlen(cmdline_buf); + } + return size; +} + +/* comsume_events removes the first 'num' entries from the ring buffer. */ +static int consume_events(size_t num) +{ + TRY(iter_init()); + for (; num > 0 && !trace_empty(&iter); num--) { + trace_find_next_entry_inc(&iter); + ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts, + &iter.lost_events); + } + TRY(iter_deinit()); + return 0; +} + +static int data_init(void) +{ + if (current_format) + TRY(iter_init()); + else + TRY(pager_init()); + return 0; +} + +/* data_next will figure out the right 'next' function to + * call and will select the right buffer to pass back + * to compress_next. + * + * iter_next should be used to get data entry-by-entry, ordered + * by time, which is what we need in order to convert it to ascii. 
+ * + * pager_next will return a full page of raw data at a time, one + * CPU at a time. pager_next_cpu must be called to get the next CPU. + * cmdline_next will get the next saved cmdline + */ +static ssize_t data_next(const char **buf) +{ + ssize_t size; + + if (current_format) { + TRY(size = iter_next()); + *buf = iter.seq.buffer; + } else { + TRY(size = pager_next()); + *buf = pager.spare; + if (size == 0) { + if (pager_next_cpu()) { + size = sizeof(CPU_DELIM); + *buf = CPU_DELIM; + } else { + TRY(size = cmdline_next()); + *buf = cmdline_buf; + } + } + } + return size; +} + +static int data_deinit(void) +{ + if (current_format) + TRY(iter_deinit()); + else + TRY(pager_deinit()); + return 0; +} + +static int compress_init(void) +{ + int workspacesize, ret; + + compress_done = 0; + flush = Z_NO_FLUSH; + stream.data_type = current_format ? Z_ASCII : Z_BINARY; + workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL); + stream.workspace = vmalloc(workspacesize); + if (!stream.workspace) { + printk(TAG "Could not allocate " + "enough memory for zlib!\n"); + return -ENOMEM; + } + memset(stream.workspace, 0, workspacesize); + + ret = zlib_deflateInit(&stream, compress_level); + if (ret != Z_OK) { + printk(TAG "%s\n", stream.msg); + return ret; + } + stream.avail_in = 0; + stream.avail_out = 0; + TRY(data_init()); + return 0; +} + +/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes + * of data into the output buffer. It gets the data by calling data_next. + * It will return the most data it possibly can. If it returns 0, then + * there is no more data. + * + * By the way that zlib works, each call to zlib_deflate will possibly + * consume up to avail_in bytes from next_in, and will fill up to + * avail_out bytes in next_out. Once flush == Z_FINISH, it can not take + * any more input. It will output until it is finished, and will return + * Z_STREAM_END. 
+ */ +static ssize_t compress_next(size_t max_out) +{ + ssize_t ret; + max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK); + stream.next_out = out_buf; + stream.avail_out = max_out; + while (stream.avail_out > 0 && !compress_done) { + if (stream.avail_in == 0 && flush != Z_FINISH) { + TRY(stream.avail_in = + data_next((const char **)&stream.next_in)); + flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH; + } + if (stream.next_in != NULL) { + TRYM((ret = zlib_deflate(&stream, flush)), + "zlib: %s\n", stream.msg); + compress_done = (ret == Z_STREAM_END); + } + } + ret = max_out - stream.avail_out; + return ret; +} + +static int compress_deinit(void) +{ + TRY(data_deinit()); + + zlib_deflateEnd(&stream); + vfree(stream.workspace); + + /* TODO: remove */ + printk(TAG "Total in: %ld\n", stream.total_in); + printk(TAG "Total out: %ld\n", stream.total_out); + return stream.total_out; +} + +static int compress_reset(void) +{ + TRY(compress_deinit()); + TRY(compress_init()); + return 0; +} + +/* tracedump_init initializes all tracedump components. + * Call this before tracedump_next + */ +int tracedump_init(void) +{ + TRY(compress_init()); + return 0; +} + +/* tracedump_next will print up to max_out data from the tracing ring + * buffers using the print function selected by print_to. The data is + * compressed using zlib. + * + * The output type of the data is specified by the format_ascii module + * parameter. If format_ascii == 1, human-readable data will be output. + * Otherwise, it will output raw data from the ring buffer in cpu order, + * followed by the saved_cmdlines data. + */ +ssize_t tracedump_next(size_t max_out, int print_to) +{ + ssize_t size; + TRY(size = compress_next(max_out)); + print(out_buf, size, print_to); + return size; +} + +/* tracedump_all will print all data in the tracing ring buffers using + * the print function selected by print_to. The data is compressed using + * zlib, and is surrounded by MAGIC_NUMBER. 
+ * + * The output type of the data is specified by the format_ascii module + * parameter. If format_ascii == 1, human-readable data will be output. + * Otherwise, it will output raw data from the ring buffer in cpu order, + * followed by the saved_cmdlines data. + */ +ssize_t tracedump_all(int print_to) +{ + ssize_t ret, size = 0; + TRY(size += print_magic(print_to)); + + do { + /* Here the size used doesn't really matter, + * since we're dumping everything. */ + TRY(ret = tracedump_next(0xFFFFFFFF, print_to)); + size += ret; + } while (ret > 0); + + TRY(size += print_magic(print_to)); + + return size; +} + +/* tracedump_deinit deinitializes all tracedump components. + * This must be called, even on error. + */ +int tracedump_deinit(void) +{ + TRY(compress_deinit()); + return 0; +} + +/* tracedump_reset reinitializes all tracedump components. */ +int tracedump_reset(void) +{ + TRY(compress_reset()); + return 0; +} + + + +/* tracedump_open opens the tracedump file for reading. */ +static int tracedump_open(struct inode *inode, struct file *file) +{ + int ret; + mutex_lock(&tracedump_proc_lock); + current_format = format_ascii; + ret = tracedump_init(); + if (ret < 0) + goto err; + + ret = nonseekable_open(inode, file); + if (ret < 0) + goto err; + return ret; + +err: + mutex_unlock(&tracedump_proc_lock); + return ret; +} + +/* tracedump_read will reads data from tracedump_next and prints + * it to userspace. It will surround the data with MAGIC_NUMBER. 
+ */ +static ssize_t tracedump_read(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + static int done; + ssize_t size = 0; + + pager.ubuf = buf; + + if (*offset == 0) { + done = 0; + TRY(size = print_magic(TD_PRINT_USER)); + } else if (!done) { + TRY(size = tracedump_next(len, TD_PRINT_USER)); + if (size == 0) { + TRY(size = print_magic(TD_PRINT_USER)); + done = 1; + } + } + + *offset += size; + + return size; +} + +static int tracedump_release(struct inode *inode, struct file *file) +{ + int ret; + ret = tracedump_deinit(); + mutex_unlock(&tracedump_proc_lock); + return ret; +} + +/* tracedump_dump dumps all tracing data from the tracing ring buffers + * to all consoles. For details about the output format, see + * tracedump_all. + + * At most max_out bytes are dumped. To accomplish this, + * tracedump_dump calls tracedump_all several times without writing the data, + * each time tossing out old data until it reaches its goal. + * + * Note: dumping raw pages currently does NOT follow the size limit. 
+ */ + +int tracedump_dump(size_t max_out) +{ + ssize_t size; + size_t consume; + + printk(TAG "\n"); + + tracedump_init(); + + if (format_ascii) { + size = tracedump_all(TD_NO_PRINT); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + while (size > max_out) { + TRY(tracedump_deinit()); + /* Events take more or less 60 ascii bytes each, + not counting compression */ + consume = TD_MIN_CONSUME + (size - max_out) / + (60 / (compress_level + 1)); + TRY(consume_events(consume)); + TRY(tracedump_init()); + size = tracedump_all(TD_NO_PRINT); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + } + + TRY(tracedump_reset()); + } + size = tracedump_all(TD_PRINT_CONSOLE); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + +out: + tracedump_deinit(); + printk(KERN_INFO "\n" TAG " end\n"); + return size; +} + +static const struct file_operations tracedump_fops = { + .owner = THIS_MODULE, + .open = tracedump_open, + .read = tracedump_read, + .release = tracedump_release, +}; + +#ifdef CONFIG_TRACEDUMP_PANIC +static int tracedump_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + tracedump_dump(panic_size); + return 0; +} + +static struct notifier_block tracedump_panic_notifier = { + .notifier_call = tracedump_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; +#endif + +static int __init tracedump_initcall(void) +{ +#ifdef CONFIG_TRACEDUMP_PROCFS + struct proc_dir_entry *entry; + + /* Create a procfs file for easy dumping */ + entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL); + if (!entry) + printk(TAG "failed to create proc entry\n"); + else + entry->proc_fops = &tracedump_fops; +#endif + +#ifdef CONFIG_TRACEDUMP_PANIC + /* Automatically dump to console on a kernel panic */ + atomic_notifier_chain_register(&panic_notifier_list, + &tracedump_panic_notifier); +#endif + return 0; +} + +early_initcall(tracedump_initcall); |