From 958de668197651bbf2b4b9528f204ab5a0f1af65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Oct 2019 21:07:31 +0200 Subject: module: Remove set_all_modules_text_*() Now that there are no users of set_all_modules_text_*() left, remove it. While it appears nds32 uses it, it does not have STRICT_MODULE_RWX and therefore ends up with the NOP stubs. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greentime Hu Cc: H. Peter Anvin Cc: Jessica Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Chen Link: https://lkml.kernel.org/r/20191111132458.284298307@infradead.org Signed-off-by: Ingo Molnar --- kernel/module.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index acf7962936c4..5cd9bed595f4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2029,49 +2029,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -/* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - frob_text(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - } - mutex_unlock(&module_mutex); -} - -/* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - /* - * Ignore going modules since it's possible that ro - * protection has already been disabled, otherwise we'll - * run into protection faults at module deallocation. - */ - if (mod->state == MODULE_STATE_UNFORMED || - mod->state == MODULE_STATE_GOING) - continue; - - frob_text(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - } - mutex_unlock(&module_mutex); -} #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ -- cgit v1.2.3 From 04ae87a52074e2d448fc66143f1bd2c7d694d2b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Oct 2019 22:26:59 +0200 Subject: ftrace: Rework event_create_dir() Rework event_create_dir() to use an array of static data instead of function pointers where possible. The problem is that it would call the function pointer on module load before parse_args(), possibly even before jump_labels were initialized. Luckily the generated functions don't use jump_labels but it still seems fragile. It also gets in the way of changing when we make the module map executable. The generated function are basically calling trace_define_field() with a bunch of static arguments. So instead of a function, capture these arguments in a static array, avoiding the function call. Now there are a number of cases where the fields are dynamic (syscall arguments, kprobes and uprobes), in which case a static array does not work, for these we preserve the function call. Luckily all these cases are not related to modules and so we can retain the function call for them. Also fix up all broken tracepoint definitions that now generate a compile error. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.342979914@infradead.org Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 31 ++++++------ kernel/trace/trace_entries.h | 66 +++++++----------------- kernel/trace/trace_events.c | 20 +++++++- kernel/trace/trace_events_hist.c | 8 ++- kernel/trace/trace_export.c | 106 ++++++++++++++------------------------- kernel/trace/trace_kprobe.c | 16 +++++- kernel/trace/trace_syscalls.c | 50 ++++++++---------- kernel/trace/trace_uprobe.c | 9 +++- 8 files changed, 138 insertions(+), 168 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..298a7cacf146 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -49,6 +49,9 @@ enum trace_type { #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_struct #define __field_struct(type, item) __field(type, item) @@ -68,26 +71,22 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct struct_name { \ struct trace_entry ent; \ tstruct \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ - filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) __packed +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed #include "trace_entries.h" @@ -1899,17 +1898,15 @@ extern void tracing_log_err(struct trace_array *tr, #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fc8e97328e54..3e9d81608284 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) ), F_printk(" %ps <-- %ps", (void *)__entry->ip, (void *)__entry->parent_ip), - FILTER_TRACE_FN, - perf_ftrace_event_register ); @@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), - - FILTER_OTHER + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) ); /* Function return entry */ @@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long, ret, overrun ) __field_desc( unsigned long long, ret, calltime) __field_desc( unsigned long long, ret, rettime ) - __field_desc( unsigned long, ret, overrun ) __field_desc( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth), - - FILTER_OTHER + __entry->depth) ); /* @@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); /* @@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->fmt), - - FILTER_OTHER + (void *)__entry->ip, __entry->fmt) ); FTRACE_ENTRY_REG(print, print_entry, @@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry, F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), - FILTER_OTHER, - ftrace_event_register ); @@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry, ), F_printk("id:%04x %08x", - __entry->id, (int)__entry->buf[0]), - - FILTER_OTHER + __entry->id, (int)__entry->buf[0]) ); FTRACE_ENTRY(bputs, bputs_entry, @@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->str), - - FILTER_OTHER + (void *)__entry->ip, __entry->str) ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, __field_desc( resource_size_t, rw, phys ) __field_desc( unsigned long, rw, value ) __field_desc( unsigned long, rw, pc ) - __field_desc( int, rw, map_id ) + __field_desc( int, rw, map_id ) __field_desc( unsigned char, rw, opcode ) __field_desc( unsigned char, rw, width ) ), F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width), - - FILTER_OTHER + __entry->map_id, __entry->opcode, __entry->width) ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __field_desc( resource_size_t, map, phys ) __field_desc( unsigned long, map, virt ) __field_desc( unsigned long, map, len ) - __field_desc( int, map, map_id ) + __field_desc( int, map, map_id ) __field_desc( unsigned char, map, opcode ) ), F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode), - - FILTER_OTHER + __entry->map_id, __entry->opcode) ); @@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)%s", __entry->line, __entry->func, __entry->file, __entry->correct, - __entry->constant ? " CONSTANT" : ""), - - FILTER_OTHER + __entry->constant ? " CONSTANT" : "") ); @@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->duration, __entry->outer_duration, __entry->nmi_total_ts, - __entry->nmi_count), - - FILTER_OTHER + __entry->nmi_count) ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..5ab10c3dce78 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,6 +24,7 @@ #include #include +#include #include @@ -1990,7 +1991,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) */ head = trace_get_fields(call); if (list_empty(head)) { - ret = call->class->define_fields(call); + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + ret = field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (ret) + break; + + offset += field->size; + } if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..3bec92c020f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1135,6 +1135,12 @@ static struct synth_event *find_synth_event(const char *name) return NULL; } +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -1156,7 +1162,7 @@ static int register_synth_event(struct synth_event *event) INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &synth_event_funcs; - call->class->define_fields = synth_event_define_fields; + call->class->fields_array = synth_event_fields_array; ret = register_trace_event(&call->event); if (!ret) { diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..6d64c1c19fd5 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call, * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) /* not needed for this file */ #undef __field_struct @@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_desc #define __field_desc(type, container, item) type item; @@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call, #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct ____ftrace_##name { \ tstruct \ }; \ @@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#undef __field_ext +#define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = _filter_type }, + #undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) + +#undef __field_fn +#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) #undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) #undef __array -#define __array(type, item, len) \ - do { \ - char *type_str = #type"["__stringify(len)"]"; \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, type_str, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; \ - } while (0); +#define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ + .size = sizeof(_type[_len]), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef __array_desc -#define __array_desc(type, container, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) #undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type "[]", #item, \ - offsetof(typeof(field), item), \ - 0, is_signed_type(type), filter_type);\ - if (ret) \ - return ret; +#define __dynamic_array(_type, _item) { \ + .type = #_type "[]", .name = #_item, \ + .size = 0, .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -static int __init \ -ftrace_define_fields_##name(struct trace_event_call *event_call) \ -{ \ - struct struct_name field; \ - int ret; \ - int filter_type = filter; \ - \ - tstruct; \ - \ - return ret; \ -} +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +static struct trace_event_fields ftrace_event_fields_##name[] = { \ + tstruct \ + {} }; #include "trace_entries.h" @@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #undef __field #define __field(type, item) +#undef __field_fn +#define __field_fn(type, item) + #undef __field_desc #define __field_desc(type, container, item) @@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ - regfn) \ - \ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \ struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ - .define_fields = ftrace_define_fields_##call, \ + .fields_array = ftrace_event_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ @@ -191,9 +163,9 @@ struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), filter, NULL) + PARAMS(tstruct), PARAMS(print), NULL) bool ftrace_event_is_function(struct trace_event_call *call) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..66e0a8ff1c01 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1534,16 +1534,28 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; +static struct trace_event_fields kretprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kretprobe_event_define_fields }, + {} +}; + +static struct trace_event_fields kprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_kprobe *tk) { struct trace_event_call *call = trace_probe_event_call(&tk->tp); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; - call->class->define_fields = kretprobe_event_define_fields; + call->class->fields_array = kretprobe_fields_array; } else { call->event.funcs = &kprobe_funcs; - call->class->define_fields = kprobe_event_define_fields; + call->class->fields_array = kprobe_fields_array; } call->flags = TRACE_EVENT_FL_KPROBE; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..53935259f701 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -198,11 +198,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, field, name) \ - sizeof(type) != sizeof(trace.field) ? \ - __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), field), \ - sizeof(trace.field), is_signed_type(type) +#define SYSCALL_FIELD(_type, _name) { \ + .type = #_type, .name = #_name, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -269,42 +268,22 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; - int ret; - int i; int offset = offsetof(typeof(trace), args); - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; + int ret, i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, sizeof(unsigned long), 0, FILTER_OTHER); + if (ret) + break; offset += sizeof(unsigned long); } return ret; } -static int __init syscall_exit_define_fields(struct trace_event_call *call) -{ - struct syscall_trace_exit trace; - int ret; - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; - - ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), - FILTER_OTHER); - - return ret; -} - static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -502,6 +481,13 @@ static int __init init_syscall_trace(struct trace_event_call *call) return id; } +static struct trace_event_fields __refdata syscall_enter_fields_array[] = { + SYSCALL_FIELD(int, __syscall_nr), + { .type = TRACE_FUNCTION_TYPE, + .define_fields = syscall_enter_define_fields }, + {} +}; + struct trace_event_functions enter_syscall_print_funcs = { .trace = print_syscall_enter, }; @@ -513,7 +499,7 @@ struct trace_event_functions exit_syscall_print_funcs = { struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, + .fields_array = syscall_enter_fields_array, .get_fields = syscall_get_enter_fields, .raw_init = init_syscall_trace, }; @@ -521,7 +507,11 @@ struct trace_event_class __refdata event_class_syscall_enter = { struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, + .fields_array = (struct trace_event_fields[]){ + SYSCALL_FIELD(int, __syscall_nr), + SYSCALL_FIELD(long, ret), + {} + }, .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), .raw_init = init_syscall_trace, }; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..476a382f1f1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1507,12 +1507,17 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; +static struct trace_event_fields uprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = uprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_uprobe *tu) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); - call->event.funcs = &uprobe_funcs; - call->class->define_fields = uprobe_event_define_fields; + call->class->fields_array = uprobe_fields_array; call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; -- cgit v1.2.3 From f66c0447cca1281116224d474cdb37d6a18e4b5b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 27 Nov 2019 14:57:04 +0900 Subject: kprobes: Set unoptimized flag after unoptimizing code Set the unoptimized flag after confirming the code is completely unoptimized. Without this fix, when a kprobe hits the intermediate modified instruction (the first byte is replaced by an INT3, but later bytes can still be a jump address operand) while unoptimizing, it can return to the middle byte of the modified code, which causes an invalid instruction exception in the kernel. Usually, this is a rare case, but if we put a probe on the function call while text patching, it always causes a kernel panic as below: # echo p text_poke+5 > kprobe_events # echo 1 > events/kprobes/enable # echo 0 > events/kprobes/enable invalid opcode: 0000 [#1] PREEMPT SMP PTI RIP: 0010:text_poke+0x9/0x50 Call Trace: arch_unoptimize_kprobe+0x22/0x28 arch_unoptimize_kprobes+0x39/0x87 kprobe_optimizer+0x6e/0x290 process_one_work+0x2a0/0x610 worker_thread+0x28/0x3d0 ? process_one_work+0x610/0x610 kthread+0x10d/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x3a/0x50 text_poke() is used for patching the code in optprobes. This can happen even if we blacklist text_poke() and other functions, because there is a small time window during which we show the intermediate code to other CPUs. [ mingo: Edited the changelog. ] Tested-by: Alexei Starovoitov Signed-off-by: Masami Hiramatsu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: bristot@redhat.com Fixes: 6274de4984a6 ("kprobes: Support delayed unoptimizing") Link: https://lkml.kernel.org/r/157483422375.25881.13508326028469515760.stgit@devnote2 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 53534aa258a6..34e28b236d68 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -510,6 +510,8 @@ static void do_unoptimize_kprobes(void) arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { + /* Switching from detour code to origin */ + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -649,6 +651,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -676,7 +679,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) return; } - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (!list_empty(&op->list)) { /* Dequeue from the optimization queue */ list_del_init(&op->list); -- cgit v1.2.3 From c5105d764e0214bcc4c6d40d7ba231d01b2e9dda Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Nov 2019 16:37:28 +0800 Subject: sched/clock: Use static_branch_likely() with sched_clock_running sched_clock_running is enabled early at bootup stage and never disabled. So hint that to the compiler by using static_branch_likely() rather than static_branch_unlikely(). The branch probability mis-annotation was introduced in the original commit that converted the plain sched_clock_running flag to a static key: 46457ea464f5 ("sched/clock: Use static key for sched_clock_running") Steve further notes: | Looks like the confusion was the moving of the "!": | | - if (unlikely(!sched_clock_running)) | + if (!static_branch_unlikely(&sched_clock_running)) | | Where, it was unlikely that !sched_clock_running would be true, but | because the "!" was moved outside the "unlikely()" it makes the test | "likely()". That is, if we added an intermediate step, it would have | been: | | if (!likely(sched_clock_running)) | | which would have prevented the mistake that this patch fixes. [ mingo: Edited the changelog. ] Signed-off-by: Zhenzhong Duan Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1574843848-26825-1-git-send-email-zhenzhong.duan@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1152259a4ca0..12bca64dff73 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -393,7 +393,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -460,7 +460,7 @@ void __init sched_clock_init(void) u64 sched_clock_cpu(int cpu) { - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); -- cgit v1.2.3 From 1b40cd56f3bcffcfedb43bc30bd431b52240fb3b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:18 +0200 Subject: sched/rt, locking: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the Kconfig dependency to use CONFIG_PREEMPTION. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20191015191821.11479-32-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index e0852dc333ac..3de8fd11873b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK # unlock and unlock_irq functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # or -# - DEBUG_SPINLOCK=n and PREEMPT=n +# - DEBUG_SPINLOCK=n and PREEMPTION=n # # unlock_bh and unlock_irqrestore functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y @@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH def_bool y @@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH def_bool y @@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y -- cgit v1.2.3 From 025f50f3866486a5278afa91f0d3b6b780141050 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:21 +0200 Subject: sched/rt, workqueue: Use PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Update the comment to use PREEMPTION because it is true for both preemption models. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: https://lore.kernel.org/r/20191015191821.11479-35-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc88fd939f4e..bf57dc717b38 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2280,7 +2280,7 @@ __acquires(&pool->lock) } /* - * The following prevents a kworker from hogging CPU on !PREEMPT + * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in -- cgit v1.2.3 From ce623f89872df4253719be71531116751eeab85f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:27 +1100 Subject: nsfs: clean-up ns_get_path() signature to return int ns_get_path() and ns_get_path_cb() only ever return either NULL or an ERR_PTR. It is far more idiomatic to simply return an integer, and it makes all of the callers of ns_get_path() more straightforward to read. Fixes: e149ed2b805f ("take the targets of /proc/*/ns/* symlinks to separate fs") Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- kernel/bpf/offload.c | 12 ++++++------ kernel/events/core.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5b9da0954a27..2c5dc6541ece 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -302,14 +302,14 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct inode *ns_inode; struct path ns_path; char __user *uinsns; - void *res; + int res; u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); - if (IS_ERR(res)) { + if (res) { if (!info->ifindex) return -ENODEV; - return PTR_ERR(res); + return res; } down_read(&bpf_devs_lock); @@ -526,13 +526,13 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) }; struct inode *ns_inode; struct path ns_path; - void *res; + int res; res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); - if (IS_ERR(res)) { + if (res) { if (!info->ifindex) return -ENODEV; - return PTR_ERR(res); + return res; } ns_inode = ns_path.dentry->d_inode; diff --git a/kernel/events/core.c b/kernel/events/core.c index 4ff86d57f9e5..c0b11b234290 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7495,7 +7495,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, { struct path ns_path; struct inode *ns_inode; - void *error; + int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { -- cgit v1.2.3 From bf08949cc8b98b7d1e20cfbba169a5938d42dae8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Dec 2019 15:14:04 +0900 Subject: modules: lockdep: Suppress suspicious RCU usage warning While running kprobe module test, find_module_all() caused a suspicious RCU usage warning. ----- ============================= WARNING: suspicious RCU usage 5.4.0-next-20191202+ #63 Not tainted ----------------------------- kernel/module.c:619 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by rmmod/642: #0: ffffffff8227da80 (module_mutex){+.+.}, at: __x64_sys_delete_module+0x9a/0x230 stack backtrace: CPU: 0 PID: 642 Comm: rmmod Not tainted 5.4.0-next-20191202+ #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x71/0xa0 find_module_all+0xc1/0xd0 __x64_sys_delete_module+0xac/0x230 ? do_syscall_64+0x12/0x1f0 do_syscall_64+0x50/0x1f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4b6d49 ----- This is because list_for_each_entry_rcu(modules) is called without rcu_read_lock(). This is safe because the module_mutex is locked. Pass lockdep_is_held(&module_mutex) to the list_for_each_entry_rcu() to suppress this warning, This also fixes similar issue in mod_find() and each_symbol_section(). Signed-off-by: Masami Hiramatsu Signed-off-by: Jessica Yu --- kernel/module.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3a486f826224..155349275b8e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -214,7 +214,8 @@ static struct module *mod_find(unsigned long addr) { struct module *mod; - list_for_each_entry_rcu(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { if (within_module(addr, mod)) return mod; } @@ -448,7 +449,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; - list_for_each_entry_rcu(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, NOT_GPL_ONLY, false }, @@ -616,7 +618,8 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry_rcu(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) -- cgit v1.2.3 From cb5172d96d16df72db8b55146b0ec00bfd97f079 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Mon, 2 Dec 2019 00:03:48 +0530 Subject: audit: Add __rcu annotation to RCU pointer Add __rcu annotation to RCU-protected global pointer auditd_conn. auditd_conn is an RCU-protected global pointer,i.e., accessed via RCU methods rcu_dereference() and rcu_assign_pointer(), hence it must be annotated with __rcu for sparse to report warnings/errors correctly. Fix multiple instances of the sparse error: error: incompatible types in comparison expression (different address spaces) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Amol Grover [PM: tweak subject line] Signed-off-by: Paul Moore --- kernel/audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8e09f0f55b4b..17b0d523afb3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -102,12 +102,13 @@ struct audit_net { * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ -static struct auditd_connection { +struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; -} *auditd_conn = NULL; +}; +static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records -- cgit v1.2.3 From 15c7c972cd26d89a26788e609c53b5a465324a6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Oct 2019 18:53:18 -0700 Subject: rcu: Use *_ONCE() to protect lockless ->expmask accesses The rcu_node structure's ->expmask field is accessed locklessly when starting a new expedited grace period and when reporting an expedited RCU CPU stall warning. This commit therefore handles the former by taking a snapshot of ->expmask while the lock is held and the latter by applying READ_ONCE() to lockless reads and WRITE_ONCE() to the corresponding updates. Link: https://lore.kernel.org/lkml/CANpmjNNmSOagbTpffHr4=Yedckx9Rm2NuGqC9UqE+AOz5f1-ZQ@mail.gmail.com Reported-by: syzbot+134336b86f728d6e55a0@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..69c5aa64fcfd 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void) rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; + WRITE_ONCE(rnp->expmask, rnp->expmaskinit); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); } } @@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -372,12 +372,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (!(mask_ofl_ipi & mask)) - continue; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; @@ -491,7 +489,7 @@ static void synchronize_sched_expedited_wait(void) struct rcu_data *rdp; mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); @@ -503,7 +501,8 @@ static void synchronize_sched_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + READ_ONCE(rnp_root->expmask), + ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -513,7 +512,7 @@ static void synchronize_sched_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, + READ_ONCE(rnp->expmask), ".T"[!!rnp->exp_tasks]); } pr_cont("\n"); @@ -521,7 +520,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; dump_cpu_task(cpu); } -- cgit v1.2.3 From 9f08cf088676c12a5b53bd5a29cf04f00c787b5d Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 8 Oct 2019 13:01:40 +0800 Subject: rcu: Avoid modifying mask_ofl_ipi in sync_rcu_exp_select_node_cpus() The "mask_ofl_ipi" is used to track which CPUs get IPIed, however in the IPI sending loop, "mask_ofl_ipi" along with another variable "mask_ofl_test" might also get modified to record which CPUs' quiesent states must be reported by the sync_rcu_exp_select_node_cpus() at the end of sync_rcu_exp_select_node_cpus(). This overlap of roles can be confusing, so this patch cleans things a little by using "mask_ofl_ipi" solely for determining which CPUs must be IPIed and "mask_ofl_test" for solely determining on behalf of which CPUs sync_rcu_exp_select_node_cpus() must report a quiscent state. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 69c5aa64fcfd..6a6f328a5f52 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -387,10 +387,10 @@ retry_ipi: } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); put_cpu(); - if (!ret) { - mask_ofl_ipi &= ~mask; + /* The CPU will report the QS in response to the IPI. */ + if (!ret) continue; - } + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rnp->qsmaskinitnext & mask) && @@ -401,13 +401,12 @@ retry_ipi: schedule_timeout_uninterruptible(1); goto retry_ipi; } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + /* CPU really is offline, so we must report its QS. */ + if (rnp->expmask & mask) + mask_ofl_test |= mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } -- cgit v1.2.3 From 6cf539a87a61a4fbc43f625267dbcbcf283872ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 9 Oct 2019 17:57:43 +0200 Subject: rcu: Fix data-race due to atomic_t copy-by-value This fixes a data-race where `atomic_t dynticks` is copied by value. The copy is performed non-atomically, resulting in a data-race if `dynticks` is updated concurrently. This data-race was found with KCSAN: ================================================================== BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3: atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline] rcu_dynticks_snap kernel/rcu/tree.c:310 [inline] dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline] rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799 kthread+0x1b5/0x200 kernel/kthread.c:255 read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7: rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline] rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870 irq_enter+0x5/0x50 kernel/softirq.c:347 Reported by Kernel Concurrency Sanitizer on: CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Workqueue: kblockd blk_mq_run_work_fn ================================================================== Signed-off-by: Marco Elver Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Ingo Molnar Cc: Dmitry Vyukov Cc: rcu@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..6145e08a1407 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); -- cgit v1.2.3 From aca2991a25da03ca96127b1d21e1f4aba41f81a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 06:51:57 -0700 Subject: rcu: Substitute lookup for bit-twiddling in sync_rcu_exp_select_node_cpus() The code in sync_rcu_exp_select_node_cpus() calculates the current CPU's mask within its rcu_node structure's bitmasks, but this has already been computed in the ->grpmask field of that CPU's rcu_data structure. This commit therefore just uses this ->grpmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6a6f328a5f52..3b59c3ee42e5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -345,8 +345,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; int snap; if (raw_smp_processor_id() == cpu || @@ -373,8 +373,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { -- cgit v1.2.3 From fd6bc19d7676a060a171d1cf3dcbf6fd797eb05f Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 03:17:07 +0000 Subject: rcu: Fix missed wakeup of exp_wq waiters Tasks waiting within exp_funnel_lock() for an expedited grace period to elapse can be starved due to the following sequence of events: 1. Tasks A and B both attempt to start an expedited grace period at about the same time. This grace period will have completed when the lower four bits of the rcu_state structure's ->expedited_sequence field are 0b'0100', for example, when the initial value of this counter is zero. Task A wins, and thus does the actual work of starting the grace period, including acquiring the rcu_state structure's .exp_mutex and sets the counter to 0b'0001'. 2. Because task B lost the race to start the grace period, it waits on ->expedited_sequence to reach 0b'0100' inside of exp_funnel_lock(). This task therefore blocks on the rcu_node structure's ->exp_wq[1] field, keeping in mind that the end-of-grace-period value of ->expedited_sequence (0b'0100') is shifted down two bits before indexing the ->exp_wq[] field. 3. Task C attempts to start another expedited grace period, but blocks on ->exp_mutex, which is still held by Task A. 4. The aforementioned expedited grace period completes, so that ->expedited_sequence now has the value 0b'0100'. A kworker task therefore acquires the rcu_state structure's ->exp_wake_mutex and starts awakening any tasks waiting for this grace period. 5. One of the first tasks awakened happens to be Task A. Task A therefore releases the rcu_state structure's ->exp_mutex, which allows Task C to start the next expedited grace period, which causes the lower four bits of the rcu_state structure's ->expedited_sequence field to become 0b'0101'. 6. Task C's expedited grace period completes, so that the lower four bits of the rcu_state structure's ->expedited_sequence field now become 0b'1000'. 7. The kworker task from step 4 above continues its wakeups. Unfortunately, the wake_up_all() refetches the rcu_state structure's .expedited_sequence field: wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); This results in the wakeup being applied to the rcu_node structure's ->exp_wq[2] field, which is unfortunate given that Task B is instead waiting on ->exp_wq[1]. On a busy system, no harm is done (or at least no permanent harm is done). Some later expedited grace period will redo the wakeup. But on a quiet system, such as many embedded systems, it might be a good long time before there was another expedited grace period. On such embedded systems, this situation could therefore result in a system hang. This issue manifested as DPM device timeout during suspend (which usually qualifies as a quiet time) due to a SCSI device being stuck in _synchronize_rcu_expedited(), with the following stack trace: schedule() synchronize_rcu_expedited() synchronize_rcu() scsi_device_quiesce() scsi_bus_suspend() dpm_run_callback() __device_suspend() This commit therefore prevents such delays, timeouts, and hangs by making rcu_exp_wait_wake() use its "s" argument consistently instead of refetching from rcu_state.expedited_sequence. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b59c3ee42e5..fa143e40cd93 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -557,7 +557,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); mutex_unlock(&rcu_state.exp_wake_mutex); -- cgit v1.2.3 From 4bc6b745e5cbefed92c48071e28a5f41246d0470 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 11:50:52 -0800 Subject: rcu: Allow only one expedited GP to run concurrently with wakeups The current expedited RCU grace-period code expects that a task requesting an expedited grace period cannot awaken until that grace period has reached the wakeup phase. However, it is possible for a long preemption to result in the waiting task never sleeping. For example, consider the following sequence of events: 1. Task A starts an expedited grace period by invoking synchronize_rcu_expedited(). It proceeds normally up to the wait_event() near the end of that function, and is then preempted (or interrupted or whatever). 2. The expedited grace period completes, and a kworker task starts the awaken phase, having incremented the counter and acquired the rcu_state structure's .exp_wake_mutex. This kworker task is then preempted or interrupted or whatever. 3. Task A resumes and enters wait_event(), which notes that the expedited grace period has completed, and thus doesn't sleep. 4. Task B starts an expedited grace period exactly as did Task A, complete with the preemption (or whatever delay) just before the call to wait_event(). 5. The expedited grace period completes, and another kworker task starts the awaken phase, having incremented the counter. However, it blocks when attempting to acquire the rcu_state structure's .exp_wake_mutex because step 2's kworker task has not yet released it. 6. Steps 4 and 5 repeat, resulting in overflow of the rcu_node structure's ->exp_wq[] array. In theory, this is harmless. Tasks waiting on the various ->exp_wq[] array will just be spuriously awakened, but they will just sleep again on noting that the rcu_state structure's ->expedited_sequence value has not advanced far enough. In practice, this wastes CPU time and is an accident waiting to happen. This commit therefore moves the rcu_exp_gp_seq_end() call that officially ends the expedited grace period (along with associate tracing) until after the ->exp_wake_mutex has been acquired. This prevents Task A from awakening prematurely, thus preventing more than one expedited grace period from being in flight during a previous expedited grace period's wakeup phase. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay [ paulmck: Added updated comment. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fa143e40cd93..7a1f09376e62 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -539,14 +539,13 @@ static void rcu_exp_wait_wake(unsigned long s) struct rcu_node *rnp; synchronize_sched_expedited_wait(); - rcu_exp_gp_seq_end(); - trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ + // Switch over to wakeup mode, allowing the next GP to proceed. + // End the previous grace period only after acquiring the mutex + // to ensure that only one GP runs concurrently with wakeups. mutex_lock(&rcu_state.exp_wake_mutex); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { -- cgit v1.2.3 From 6c7d7dbf5b7f965eda0d39fbbb8fee005b08f340 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 13:59:37 -0800 Subject: rcu: Rename sync_rcu_preempt_exp_done() to sync_rcu_exp_done() Now that the RCU flavors have been consolidated, there is one common function for checking to see if an expedited RCU grace period has completed, namely sync_rcu_preempt_exp_done(). Because this function is no longer specific to RCU-preempt, this commit removes the "_preempt" from its name. This commit also changes sync_rcu_preempt_exp_done_unlocked() to sync_rcu_exp_done_unlocked() for the same reason. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 19 +++++++++---------- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 7a1f09376e62..3923c0743c3e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -148,7 +148,7 @@ static void __maybe_unused sync_exp_reset_tree(void) * * Caller must hold the specificed rcu_node structure's ->lock */ -static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); @@ -157,17 +157,16 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) } /* - * Like sync_rcu_preempt_exp_done(), but this function assumes the caller - * doesn't hold the rcu_node's ->lock, and will acquire and release the lock - * itself + * Like sync_rcu_exp_done(), but this function assumes the caller doesn't + * hold the rcu_node's ->lock, and will acquire and release the lock itself */ -static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { unsigned long flags; bool ret; raw_spin_lock_irqsave_rcu_node(rnp, flags); - ret = sync_rcu_preempt_exp_done(rnp); + ret = sync_rcu_exp_done(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ret; @@ -191,7 +190,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, unsigned long mask; for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { + if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else @@ -471,9 +470,9 @@ static void synchronize_sched_expedited_wait(void) for (;;) { ret = swait_event_timeout_exclusive( rcu_state.expedited_wq, - sync_rcu_preempt_exp_done_unlocked(rnp_root), + sync_rcu_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) + if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -507,7 +506,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done_unlocked(rnp)) + if (sync_rcu_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..6dbea4bcf065 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -485,7 +485,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); - empty_exp = sync_rcu_preempt_exp_done(rnp); + empty_exp = sync_rcu_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -509,7 +509,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = sync_rcu_preempt_exp_done(rnp); + empty_exp_now = sync_rcu_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gp_seq, -- cgit v1.2.3 From de8cd0a533bfb57ff4ec6c85e3bdca013a5adcb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 14:20:41 -0800 Subject: rcu: Update tree_exp.h function-header comments The function-header comments in kernel/rcu/tree_exp.h have gotten a bit out of date, so this commit updates a number of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3923c0743c3e..1eafbcd56679 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void) } /* - * Return then value that expedited-grace-period counter will have + * Return the value that the expedited-grace-period counter will have * at the end of the current grace period. */ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) @@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void) } /* - * Take a snapshot of the expedited-grace-period counter. + * Take a snapshot of the expedited-grace-period counter, which is the + * earliest value that will indicate that a full grace period has + * elapsed since the current time. */ static unsigned long rcu_exp_gp_seq_snap(void) { @@ -143,22 +145,18 @@ static void __maybe_unused sync_exp_reset_tree(void) * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the specificed rcu_node structure's ->lock + * for the current expedited grace period. */ static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* - * Like sync_rcu_exp_done(), but this function assumes the caller doesn't - * hold the rcu_node's ->lock, and will acquire and release the lock itself + * Like sync_rcu_exp_done(), but where the caller does not hold the + * rcu_node's ->lock. */ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { @@ -180,8 +178,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) * which the task was queued or to one of that rcu_node structure's ancestors, * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) - * - * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) @@ -189,6 +185,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, { unsigned long mask; + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) @@ -452,6 +449,10 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, issuing any needed + * RCU CPU stall warnings along the way. + */ static void synchronize_sched_expedited_wait(void) { int cpu; @@ -781,7 +782,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() + * your code to batch your updates, and then use a single synchronize_rcu() * instead. * * This has the same semantics as (but is more brutal than) synchronize_rcu(). -- cgit v1.2.3 From 28f0361fdfab267a392cd6a6401446c9ea64de95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 14:24:58 -0800 Subject: rcu: Replace synchronize_sched_expedited_wait() "_sched" with "_rcu" After RCU flavor consolidation, synchronize_sched_expedited_wait() does both RCU-preempt and RCU-sched, whichever happens to have been built into the running kernel. This commit therefore changes this function's name to synchronize_rcu_expedited_wait() to reflect its new generic nature. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1eafbcd56679..081a17942e57 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -453,7 +453,7 @@ static void sync_rcu_exp_select_cpus(void) * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ -static void synchronize_sched_expedited_wait(void) +static void synchronize_rcu_expedited_wait(void) { int cpu; unsigned long jiffies_stall; @@ -538,7 +538,7 @@ static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(); + synchronize_rcu_expedited_wait(); // Switch over to wakeup mode, allowing the next GP to proceed. // End the previous grace period only after acquiring the mutex -- cgit v1.2.3 From df1e849ae4559544ff00ff5052eefe2479750539 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 16:36:45 -0800 Subject: rcu: Enable tick for nohz_full CPUs slow to provide expedited QS An expedited grace period can be stalled by a nohz_full CPU looping in kernel context. This possibility is currently handled by some carefully crafted checks in rcu_read_unlock_special() that enlist help from ksoftirqd when permitted by the scheduler. However, it is exactly these checks that require the scheduler avoid holding any of its rq or pi locks across rcu_read_unlock() without also having held them across the entire RCU read-side critical section. It would therefore be very nice if expedited grace periods could handle nohz_full CPUs looping in kernel context without such checks. This commit therefore adds code to the expedited grace period's wait and cleanup code that forces the scheduler-clock interrupt on for CPUs that fail to quickly supply a quiescent state. "Quickly" is currently a hard-coded single-jiffy delay. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..f9253ed406ba 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -182,6 +182,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 081a17942e57..30b2a02aef39 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { + int cpu; unsigned long flags; + struct rcu_data *rdp; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { @@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, return; } WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = false; + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, within time limit. + * If the time limit is exceeded without the grace period elapsing, + * return false, otherwise return true. + */ +static bool synchronize_rcu_expedited_wait_once(long tlimit) +{ + int t; + struct rcu_node *rnp_root = rcu_get_root(); + + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, + sync_rcu_exp_done_unlocked(rnp_root), + tlimit); + // Workqueues should not be signaled. + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + return true; + WARN_ON(t < 0); /* workqueues should not be signaled. */ + return false; +} + /* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. @@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_start; unsigned long mask; int ndetected; + struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); - int ret; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (synchronize_rcu_expedited_wait_once(1)) + return; + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = true; + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } + } + WARN_ON_ONCE(1); + } for (;;) { - ret = swait_event_timeout_exclusive( - rcu_state.expedited_wq, - sync_rcu_exp_done_unlocked(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) continue; panic_on_rcu_stall(); -- cgit v1.2.3 From 610dea36d3083a977e4f156206cbe1eaa2a532f0 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Fri, 4 Oct 2019 19:49:10 +0000 Subject: rcu/nocb: Fix dump_tree hierarchy print always active Commit 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") added print statements to rcu_organize_nocb_kthreads for debugging, but incorrectly guarded them, causing the function to always spew out its message. This patch fixes it by guarding both pr_alert statements with dump_tree, while also changing the second pr_alert to a pr_cont, to print the hierarchy in a single line (assuming that's how it was supposed to work). Fixes: 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") Signed-off-by: Stefan Reiter [ paulmck: Make single-nocbs-CPU GP kthreads look less erroneous. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..758bfe1de536 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2321,6 +2321,8 @@ static void __init rcu_organize_nocb_kthreads(void) { int cpu; bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; @@ -2343,21 +2345,31 @@ static void __init rcu_organize_nocb_kthreads(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; - if (!firsttime && dump_tree) - pr_cont("\n"); - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } } else { /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; - pr_alert(" %d", cpu); + if (dump_tree) + pr_cont(" %d", cpu); } rdp_prev = rdp; } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); } /* -- cgit v1.2.3 From 6935c3983b246d5fbfebd3b891c825e65c118f2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:21:54 -0700 Subject: rcu: Avoid data-race in rcu_gp_fqs_check_wake() The rcu_gp_fqs_check_wake() function uses rcu_preempt_blocked_readers_cgp() to read ->gp_tasks while other cpus might overwrite this field. We need READ_ONCE()/WRITE_ONCE() pairs to avoid compiler tricks and KCSAN splats like the following : BUG: KCSAN: data-race in rcu_gp_fqs_check_wake / rcu_preempt_deferred_qs_irqrestore write to 0xffffffff85a7f190 of 8 bytes by task 7317 on cpu 0: rcu_preempt_deferred_qs_irqrestore+0x43d/0x580 kernel/rcu/tree_plugin.h:507 rcu_read_unlock_special+0xec/0x370 kernel/rcu/tree_plugin.h:659 __rcu_read_unlock+0xcf/0xe0 kernel/rcu/tree_plugin.h:394 rcu_read_unlock include/linux/rcupdate.h:645 [inline] __ip_queue_xmit+0x3b0/0xa40 net/ipv4/ip_output.c:533 ip_queue_xmit+0x45/0x60 include/net/ip.h:236 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575 tcp_recvmsg+0x633/0x1a30 net/ipv4/tcp.c:2179 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 read to 0xffffffff85a7f190 of 8 bytes by task 10 on cpu 1: rcu_gp_fqs_check_wake kernel/rcu/tree.c:1556 [inline] rcu_gp_fqs_check_wake+0x93/0xd0 kernel/rcu/tree.c:1546 rcu_gp_fqs_loop+0x36c/0x580 kernel/rcu/tree.c:1611 rcu_gp_kthread+0x143/0x220 kernel/rcu/tree.c:1768 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 10 Comm: rcu_preempt Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot [ paulmck: Added another READ_ONCE() for RCU CPU stall warnings. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 758bfe1de536..fe5f44811761 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * blocked tasks. */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { - rnp->gp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) @@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - return rnp->gp_tasks != NULL; + return READ_ONCE(rnp->gp_tasks) != NULL; } /* Bias and limit values for ->rcu_read_lock_nesting. */ @@ -493,7 +493,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; + WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { @@ -663,7 +663,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { - rnp->gp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), @@ -757,7 +757,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { -- cgit v1.2.3 From 03bd2983d7a9f898fd89f8f7215c3e56732d8ecd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Oct 2019 09:05:27 -0700 Subject: rcu: Use lockdep rather than comment to enforce lock held The rcu_preempt_check_blocked_tasks() function has a comment that states that the rcu_node structure's ->lock must be held, which might be informative, but which carries little weight if not read. This commit therefore removes this comment in favor of raw_lockdep_assert_held_rcu_node(), which will complain quite visibly if the required lock is not held. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe5f44811761..ed54d36465e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -648,8 +648,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock - * must be held by the caller. + * invoked -before- updating this rnp's ->gp_seq. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. @@ -659,6 +658,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); + raw_lockdep_assert_held_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && -- cgit v1.2.3 From b3e627d3d5092a87fc9b9e37e341610cfecfbfdc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 02:55:57 +0000 Subject: rcu: Make PREEMPT_RCU be a modifier to TREE_RCU Currently PREEMPT_RCU and TREE_RCU are mutually exclusive Kconfig options. But PREEMPT_RCU actually specifies a kind of TREE_RCU, namely a preemptible TREE_RCU. This commit therefore makes PREEMPT_RCU be a modifer to the TREE_RCU Kconfig option. This has the benefit of simplifying several of the #if expressions that formerly needed to check both, but now need only check one or the other. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 13 +++++++------ kernel/rcu/Makefile | 1 - kernel/rcu/rcu.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sysctl.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 7644eda17d62..0303934e6ef0 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPTION && SMP + default y if SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -17,6 +17,7 @@ config TREE_RCU config PREEMPT_RCU bool default y if PREEMPTION + select TREE_RCU help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -78,7 +79,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU ) + def_bool TREE_RCU help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow @@ -86,13 +87,13 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 64 if 64BIT default 32 if !64BIT help @@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 16 help This option controls the leaf-level fanout of hierarchical @@ -187,7 +188,7 @@ config RCU_BOOST_DELAY config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU depends on RCU_EXPERT || NO_HZ_FULL default n help diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 020e8b6a644b..82d5fba48b2f 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..eabafde2349e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -454,7 +454,7 @@ enum rcutorture_type { INVALID_RCU_FLAVOR }; -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..34a7452b25fd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 70665934d53e..d396aaaf19a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_do_static_key, }, #endif -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, -- cgit v1.2.3 From 90326f0521a88004194f88f1b597b54347482b5c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:14 +0200 Subject: rcu: Use CONFIG_PREEMPTION where appropriate The config option `CONFIG_PREEMPT' is used for the preemption model "Low-Latency Desktop". The config option `CONFIG_PREEMPTION' is enabled when kernel preemption is enabled which is true for the preemption model `CONFIG_PREEMPT' and `CONFIG_PREEMPT_RT'. Use `CONFIG_PREEMPTION' if it applies to both preemption models and not just to `CONFIG_PREEMPT'. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Davidlohr Bueso Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 4 ++-- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcutiny.c | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0303934e6ef0..1cc940fef17c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -201,8 +201,8 @@ config RCU_NOCB_CPU specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running + the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched + (!PREEMPTION kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..121a0507a7ce 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1730,7 +1730,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) // Give the scheduler a chance, even on nohz_full CPUs. static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { - if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 44d6606b8325..6208c1dae5c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation + * that become ready as a result. Single-CPU and !PREEMPTION operation * means that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..c9dbb05e4c13 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2698,9 +2698,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..98d078cafa5a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -670,7 +670,7 @@ static void rcu_exp_handler(void *unused) } } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ static void sync_sched_exp_online_cleanup(int cpu) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed54d36465e2..8cdce111ea73 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -789,7 +789,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. @@ -839,7 +839,7 @@ void rcu_all_qs(void) EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { -- cgit v1.2.3 From a289e608b3e740c15f623148c26cdec2d6698ce0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 08:31:56 -0800 Subject: rcutorture: Pull callback forward-progress data into rcu_fwd struct Now that RCU behaves reasonably well with the current single-kthread call_rcu() forward-progress testing, it is time to add more kthreads. This commit takes a first step towards that goal by wrapping what will be the per-kthread data into a new rcu_fwd structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 103 +++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..22a75a4b6b40 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1663,23 +1663,34 @@ struct rcu_fwd_cb { struct rcu_fwd_cb *rfc_next; int rfc_gps; }; -static DEFINE_SPINLOCK(rcu_fwd_lock); -static struct rcu_fwd_cb *rcu_fwd_cb_head; -static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; -static long n_launders_cb; -static unsigned long rcu_fwd_startat; -static bool rcu_fwd_emergency_stop; + #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) + struct rcu_launder_hist { long n_launders; unsigned long launder_gp_seq; }; -#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) -static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; -static unsigned long rcu_launder_gp_seq_start; + +struct rcu_fwd { + spinlock_t rcu_fwd_lock; + struct rcu_fwd_cb *rcu_fwd_cb_head; + struct rcu_fwd_cb **rcu_fwd_cb_tail; + long n_launders_cb; + unsigned long rcu_fwd_startat; + struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; + unsigned long rcu_launder_gp_seq_start; +}; + +struct rcu_fwd rcu_fwds = { + .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), + .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, +}; + +bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(void) { @@ -1688,16 +1699,17 @@ static void rcu_torture_fwd_cb_hist(void) int i; int j; - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) + if (rcu_fwds.n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwd_startat); - gps_old = rcu_launder_gp_seq_start; + __func__, jiffies - rcu_fwds.rcu_fwd_startat); + gps_old = rcu_fwds.rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = n_launders_hist[j].launder_gp_seq; + gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + j + 1, FWD_CBS_HIST_DIV, + rcu_fwds.n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1714,17 +1726,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcpp = rcu_fwd_cb_tail; - rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcpp = rcu_fwds.rcu_fwd_cb_tail; + rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(n_launders_hist)) - i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i].n_launders++; - n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); + i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) + i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; + rcu_fwds.n_launders_hist[i].n_launders++; + rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1751,16 +1763,16 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcp = rcu_fwd_cb_head; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcp = rcu_fwds.rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); break; } - rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwd_cb_head) - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwds.rcu_fwd_cb_head) + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1804,8 +1816,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + dur; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1864,23 +1876,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - n_launders_cb = 0; + rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) + rcu_fwds.n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_launder_gp_seq_start = gps; + rcu_fwds.rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1888,7 +1900,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwd_cb_head = rfcpn; + rcu_fwds.rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1910,7 +1922,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); + n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ @@ -1921,7 +1933,8 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwd_startat, jiffies - stoppedat, + stoppedat - rcu_fwds.rcu_fwd_startat, + jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); @@ -1943,7 +1956,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", -- cgit v1.2.3 From 6b1b832546067caac8c5833abf88fa082d253b2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 09:08:58 -0800 Subject: rcutorture: Thread rcu_fwd pointer through forward-progress functions In order to add multiple kthreads, it will be necessary to allow the various functions to operate on a pointer to their kthread's rcu_fwd structure. This commit therefore starts the process of adding the needed "struct rcu_fwd" parameters and arguments to the various callback forward-progress functions. Note that rcutorture_oom_notify() and rcu_torture_fwd_cb_hist() will eventually need to iterate over all kthreads' rcu_fwd structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 78 ++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22a75a4b6b40..cc88ce910a6d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1661,6 +1661,7 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) struct rcu_fwd_cb { struct rcu_head rh; struct rcu_fwd_cb *rfc_next; + struct rcu_fwd *rfc_rfp; int rfc_gps; }; @@ -1692,24 +1693,24 @@ struct rcu_fwd rcu_fwds = { bool rcu_fwd_emergency_stop; -static void rcu_torture_fwd_cb_hist(void) +static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { unsigned long gps; unsigned long gps_old; int i; int j; - for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) - if (rcu_fwds.n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) + if (rfp->n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwds.rcu_fwd_startat); - gps_old = rcu_fwds.rcu_launder_gp_seq_start; + __func__, jiffies - rfp->rcu_fwd_startat); + gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; + gps = rfp->n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", j + 1, FWD_CBS_HIST_DIV, - rcu_fwds.n_launders_hist[j].n_launders, + rfp->n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1723,20 +1724,21 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; + struct rcu_fwd *rfp = rfcp->rfc_rfp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcpp = rcu_fwds.rcu_fwd_cb_tail; - rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcpp = rfp->rcu_fwd_cb_tail; + rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); - i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) - i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; - rcu_fwds.n_launders_hist[i].n_launders++; - rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); + i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rfp->n_launders_hist)) + i = ARRAY_SIZE(rfp->n_launders_hist) - 1; + rfp->n_launders_hist[i].n_launders++; + rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1786,7 +1788,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) } /* Carry out need_resched()/cond_resched() forward-progress testing. */ -static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) +static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, + int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; @@ -1816,8 +1819,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + dur; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1852,7 +1855,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Carry out call_rcu() forward-progress testing. */ -static void rcu_torture_fwd_prog_cr(void) +static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) { unsigned long cver; unsigned long flags; @@ -1876,23 +1879,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread + rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) - rcu_fwds.n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++) + rfp->n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_fwds.rcu_launder_gp_seq_start = gps; + rfp->rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); + rfcp = READ_ONCE(rfp->rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1900,7 +1903,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwds.rcu_fwd_cb_head = rfcpn; + rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1912,6 +1915,7 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs++; n_launders_sa = 0; rfcp->rfc_gps = 0; + rfcp->rfc_rfp = rfp; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); @@ -1922,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); + n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ @@ -1933,12 +1937,11 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwds.rcu_fwd_startat, - jiffies - stoppedat, + stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); @@ -1955,7 +1958,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, { WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(&rcu_fwds); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ @@ -1980,6 +1983,7 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -1991,8 +1995,8 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(&tested, &tested_tries); - rcu_torture_fwd_prog_cr(); + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2027,7 +2031,7 @@ static int __init rcu_torture_fwd_prog_init(void) if (fwd_progress_div <= 0) fwd_progress_div = 4; return torture_create_kthread(rcu_torture_fwd_prog, - NULL, fwd_prog_task); + &rcu_fwds, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit v1.2.3 From 7beba0c06b588c725962bcc0273a489d46e81ccf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 07:49:31 -0800 Subject: rcutorture: Move to dynamic initialization of rcu_fwds In order to add multiple call_rcu() forward-progress kthreads, it will be necessary to dynamically allocate and initialize. This commit therefore moves the initialization from compile time to instead immediately precede thread-creation time. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cc88ce910a6d..6f540fed942c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,11 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds = { - .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), - .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, -}; - +struct rcu_fwd rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2026,6 +2022,8 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } + spin_lock_init(&rcu_fwds.rcu_fwd_lock); + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) -- cgit v1.2.3 From 6764100bd2927060aae91b40fce015f39fc4fd87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:20:20 -0800 Subject: rcutorture: Complete threading rcu_fwd pointers through functions This commit threads pointers to rcu_fwd structures through the remaining functions using rcu_fwds directly, namely rcu_torture_fwd_prog_cbfree(), rcutorture_oom_notify() and rcu_torture_fwd_prog_init(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6f540fed942c..394baac98ae0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1754,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. */ -static unsigned long rcu_torture_fwd_prog_cbfree(void) +static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) { unsigned long flags; unsigned long freed = 0; struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcp = rcu_fwds.rcu_fwd_cb_head; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcp = rfp->rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); break; } - rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwds.rcu_fwd_cb_head) - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + rfp->rcu_fwd_cb_head = rfcp->rfc_next; + if (!rfp->rcu_fwd_cb_head) + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1926,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - (void)rcu_torture_fwd_prog_cbfree(); + (void)rcu_torture_fwd_prog_cbfree(rfp); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { @@ -1952,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + struct rcu_fwd *rfp = &rcu_fwds; + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(&rcu_fwds); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); + rcu_torture_fwd_cb_hist(rfp); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@ -2008,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + struct rcu_fwd *rfp = &rcu_fwds; + if (!fwd_progress) return 0; /* Not requested, so don't do it. */ if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || @@ -2022,14 +2026,13 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } - spin_lock_init(&rcu_fwds.rcu_fwd_lock); - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - return torture_create_kthread(rcu_torture_fwd_prog, - &rcu_fwds, fwd_prog_task); + return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit v1.2.3 From 5155be9994e557618a8312389fb4e52dfbf28a3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:35:08 -0800 Subject: rcutorture: Dynamically allocate rcu_fwds structure This commit switches from static structure to dynamic allocation for rcu_fwds as another step towards providing multiple call_rcu() forward-progress kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 394baac98ae0..f77f4d886cc1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,7 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds; +struct rcu_fwd *rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -1952,7 +1952,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp = rcu_fwds; WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); @@ -2010,7 +2010,7 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ @@ -2026,12 +2026,15 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; + rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); + if (!rfp) + return -ENOMEM; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } -- cgit v1.2.3 From a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 Nov 2019 09:43:10 -0500 Subject: simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems two requirements: no file creations in IS_DEADDIR and no cross-directory renames whatsoever. Signed-off-by: Al Viro --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_events.c | 6 +++--- kernel/trace/trace_hwlat.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 23459d53d576..84a004638157 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8496,7 +8496,7 @@ static struct trace_array *trace_array_create(const char *name) ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - tracefs_remove_recursive(tr->dir); + tracefs_remove(tr->dir); goto out_free_tr; } @@ -8605,7 +8605,7 @@ static int __remove_instance(struct trace_array *tr) event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); - tracefs_remove_recursive(tr->dir); + tracefs_remove(tr->dir); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c6de3cebc127..1faf3a1e24a4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -697,7 +697,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - tracefs_remove_recursive(dir->entry); + tracefs_remove(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -716,7 +716,7 @@ static void remove_event_file_dir(struct trace_event_file *file) } spin_unlock(&dir->d_lock); - tracefs_remove_recursive(dir); + tracefs_remove(dir); } list_del(&file->list); @@ -3064,7 +3064,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - tracefs_remove_recursive(tr->event_dir); + tracefs_remove(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 6638d63f0921..402d022d0229 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -556,7 +556,7 @@ static int init_tracefs(void) return 0; err: - tracefs_remove_recursive(top_dir); + tracefs_remove(top_dir); return -ENOMEM; } -- cgit v1.2.3 From 07928d9bfc81640bab36f5190e8725894d93b659 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 19 Nov 2019 13:17:31 +0800 Subject: padata: Remove broken queue flushing The function padata_flush_queues is fundamentally broken because it cannot force padata users to complete the request that is underway. IOW padata has to passively wait for the completion of any outstanding work. As it stands flushing is used in two places. Its use in padata_stop is simply unnecessary because nothing depends on the queues to be flushed afterwards. The other use in padata_replace is more substantial as we depend on it to free the old pd structure. This patch instead uses the pd->refcnt to dynamically free the pd structure once all requests are complete. Fixes: 2b73b07ab8a4 ("padata: Flush the padata queues actively") Cc: Signed-off-by: Herbert Xu Reviewed-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index c3fec1413295..da56a235a255 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -35,6 +35,8 @@ #define MAX_OBJ_NUM 1000 +static void padata_free_pd(struct parallel_data *pd); + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; @@ -283,6 +285,7 @@ static void padata_serial_worker(struct work_struct *serial_work) struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); + int cnt; local_bh_disable(); squeue = container_of(serial_work, struct padata_serial_queue, work); @@ -292,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work) list_replace_init(&squeue->serial.list, &local_list); spin_unlock(&squeue->serial.lock); + cnt = 0; + while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -301,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work) list_del_init(&padata->list); padata->serial(padata); - atomic_dec(&pd->refcnt); + cnt++; } local_bh_enable(); + + if (atomic_sub_and_test(cnt, &pd->refcnt)) + padata_free_pd(pd); } /** @@ -440,7 +448,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_squeues(pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); - atomic_set(&pd->refcnt, 0); + atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -466,29 +474,6 @@ static void padata_free_pd(struct parallel_data *pd) kfree(pd); } -/* Flush all objects out of the padata queues. */ -static void padata_flush_queues(struct parallel_data *pd) -{ - int cpu; - struct padata_parallel_queue *pqueue; - struct padata_serial_queue *squeue; - - for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - flush_work(&pqueue->work); - } - - if (atomic_read(&pd->reorder_objects)) - padata_reorder(pd); - - for_each_cpu(cpu, pd->cpumask.cbcpu) { - squeue = per_cpu_ptr(pd->squeue, cpu); - flush_work(&squeue->work); - } - - BUG_ON(atomic_read(&pd->refcnt) != 0); -} - static void __padata_start(struct padata_instance *pinst) { pinst->flags |= PADATA_INIT; @@ -502,10 +487,6 @@ static void __padata_stop(struct padata_instance *pinst) pinst->flags &= ~PADATA_INIT; synchronize_rcu(); - - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); } /* Replace the internal control structure with a new one. */ @@ -526,8 +507,8 @@ static void padata_replace(struct padata_instance *pinst, if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; - padata_flush_queues(pd_old); - padata_free_pd(pd_old); + if (atomic_dec_and_test(&pd_old->refcnt)) + padata_free_pd(pd_old); if (notification_mask) blocking_notifier_call_chain(&pinst->cpumask_change_notifier, -- cgit v1.2.3 From 13380a1471aadc517994b7230371a227d1f9f152 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 20 Nov 2019 06:32:50 +0800 Subject: padata: Remove unused padata_remove_cpu The function padata_remove_cpu was supposed to have been removed along with padata_add_cpu but somehow it remained behind. Let's kill it now as it doesn't even have a prototype anymore. Fixes: 815613da6a67 ("kernel/padata.c: removed unused code") Signed-off-by: Herbert Xu Reviewed-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index da56a235a255..fc00f7e64133 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -718,41 +718,6 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to remove - * @mask: bitmask specifying from which cpumask @cpu should be removed - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_remove_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_remove_cpu); - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || -- cgit v1.2.3 From bbefa1dd6a6d53537c11624752219e39959d04fb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Nov 2019 15:58:45 +0800 Subject: crypto: pcrypt - Avoid deadlock by using per-instance padata queues If the pcrypt template is used multiple times in an algorithm, then a deadlock occurs because all pcrypt instances share the same padata_instance, which completes requests in the order submitted. That is, the inner pcrypt request waits for the outer pcrypt request while the outer request is already waiting for the inner. This patch fixes this by allocating a set of queues for each pcrypt instance instead of using two global queues. In order to maintain the existing user-space interface, the pinst structure remains global so any sysfs modifications will apply to every pcrypt instance. Note that when an update occurs we have to allocate memory for every pcrypt instance. Should one of the allocations fail we will abort the update without rolling back changes already made. The new per-instance data structure is called padata_shell and is essentially a wrapper around parallel_data. Reproducer: #include #include #include int main() { struct sockaddr_alg addr = { .salg_type = "aead", .salg_name = "pcrypt(pcrypt(rfc4106-gcm-aesni))" }; int algfd, reqfd; char buf[32] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (void *)&addr, sizeof(addr)); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, buf, 20); reqfd = accept(algfd, 0, 0); write(reqfd, buf, 32); read(reqfd, buf, 16); } Reported-by: syzbot+56c7151cad94eec37c521f0e47d2eee53f9361c4@syzkaller.appspotmail.com Fixes: 5068c7a883d1 ("crypto: pcrypt - Add pcrypt crypto parallelization wrapper") Signed-off-by: Herbert Xu Tested-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/padata.c | 236 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 165 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index fc00f7e64133..8c8755f170ca 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -89,7 +89,7 @@ static void padata_parallel_worker(struct work_struct *parallel_work) /** * padata_do_parallel - padata parallelization function * - * @pinst: padata instance + * @ps: padatashell * @padata: object to be parallelized * @cb_cpu: pointer to the CPU that the serialization callback function should * run on. If it's not in the serial cpumask of @pinst @@ -100,16 +100,17 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. */ -int padata_do_parallel(struct padata_instance *pinst, +int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { + struct padata_instance *pinst = ps->pinst; int i, cpu, cpu_index, target_cpu, err; struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); - pd = rcu_dereference_bh(pinst->pd); + pd = rcu_dereference_bh(ps->pd); err = -EINVAL; if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) @@ -212,10 +213,10 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, static void padata_reorder(struct parallel_data *pd) { + struct padata_instance *pinst = pd->ps->pinst; int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_instance *pinst = pd->pinst; struct padata_parallel_queue *next_queue; /* @@ -349,36 +350,39 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -static int padata_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static int padata_setup_cpumasks(struct padata_instance *pinst) { struct workqueue_attrs *attrs; + int err; + + attrs = alloc_workqueue_attrs(); + if (!attrs) + return -ENOMEM; + + /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ + cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu); + err = apply_workqueue_attrs(pinst->parallel_wq, attrs); + free_workqueue_attrs(attrs); + + return err; +} + +static int pd_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ int err = -ENOMEM; if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto out; - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) goto free_pcpu_mask; - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); - attrs = alloc_workqueue_attrs(); - if (!attrs) - goto free_cbcpu_mask; - - /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ - cpumask_copy(attrs->cpumask, pd->cpumask.pcpu); - err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs); - free_workqueue_attrs(attrs); - if (err < 0) - goto free_cbcpu_mask; + cpumask_copy(pd->cpumask.pcpu, pcpumask); + cpumask_copy(pd->cpumask.cbcpu, cbcpumask); return 0; -free_cbcpu_mask: - free_cpumask_var(pd->cpumask.cbcpu); free_pcpu_mask: free_cpumask_var(pd->cpumask.pcpu); out: @@ -422,12 +426,16 @@ static void padata_init_pqueues(struct parallel_data *pd) } /* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { + struct padata_instance *pinst = ps->pinst; + const struct cpumask *cbcpumask; + const struct cpumask *pcpumask; struct parallel_data *pd; + cbcpumask = pinst->rcpumask.cbcpu; + pcpumask = pinst->rcpumask.pcpu; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; @@ -440,8 +448,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, if (!pd->squeue) goto err_free_pqueue; - pd->pinst = pinst; - if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + pd->ps = ps; + if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) goto err_free_squeue; padata_init_pqueues(pd); @@ -490,32 +498,64 @@ static void __padata_stop(struct padata_instance *pinst) } /* Replace the internal control structure with a new one. */ -static void padata_replace(struct padata_instance *pinst, - struct parallel_data *pd_new) +static int padata_replace_one(struct padata_shell *ps) { - struct parallel_data *pd_old = pinst->pd; - int notification_mask = 0; + struct parallel_data *pd_new; - pinst->flags |= PADATA_RESET; + pd_new = padata_alloc_pd(ps); + if (!pd_new) + return -ENOMEM; - rcu_assign_pointer(pinst->pd, pd_new); + ps->opd = rcu_dereference_protected(ps->pd, 1); + rcu_assign_pointer(ps->pd, pd_new); - synchronize_rcu(); + return 0; +} + +static int padata_replace(struct padata_instance *pinst, int cpu) +{ + int notification_mask = 0; + struct padata_shell *ps; + int err; + + pinst->flags |= PADATA_RESET; - if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); + cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, + cpu_online_mask); + if (cpu >= 0) + cpumask_clear_cpu(cpu, pinst->rcpumask.pcpu); + if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) notification_mask |= PADATA_CPU_PARALLEL; - if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + + cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); + cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, + cpu_online_mask); + if (cpu >= 0) + cpumask_clear_cpu(cpu, pinst->rcpumask.cbcpu); + if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; - if (atomic_dec_and_test(&pd_old->refcnt)) - padata_free_pd(pd_old); + list_for_each_entry(ps, &pinst->pslist, list) { + err = padata_replace_one(ps); + if (err) + break; + } + + synchronize_rcu(); + + list_for_each_entry_continue_reverse(ps, &pinst->pslist, list) + if (atomic_dec_and_test(&ps->opd->refcnt)) + padata_free_pd(ps->opd); if (notification_mask) blocking_notifier_call_chain(&pinst->cpumask_change_notifier, notification_mask, - &pd_new->cpumask); + &pinst->cpumask); pinst->flags &= ~PADATA_RESET; + + return err; } /** @@ -568,7 +608,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t cbcpumask) { int valid; - struct parallel_data *pd; + int err; valid = padata_validate_cpumask(pinst, pcpumask); if (!valid) { @@ -581,19 +621,15 @@ static int __padata_set_cpumasks(struct padata_instance *pinst, __padata_stop(pinst); out_replace: - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) - return -ENOMEM; - cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - padata_replace(pinst, pd); + err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1); if (valid) __padata_start(pinst); - return 0; + return err; } /** @@ -676,46 +712,32 @@ EXPORT_SYMBOL(padata_stop); static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); + err = padata_replace(pinst, -1); if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_start(pinst); } - return 0; + return err; } static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd = NULL; + int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_stop(pinst); - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); - - cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); - cpumask_clear_cpu(cpu, pd->cpumask.pcpu); + err = padata_replace(pinst, cpu); } - return 0; + return err; } static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) @@ -763,8 +785,12 @@ static void __padata_free(struct padata_instance *pinst) cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif + WARN_ON(!list_empty(&pinst->pslist)); + padata_stop(pinst); - padata_free_pd(pinst->pd); + free_cpumask_var(pinst->omask); + free_cpumask_var(pinst->rcpumask.cbcpu); + free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -911,7 +937,6 @@ static struct padata_instance *padata_alloc(const char *name, const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) @@ -939,14 +964,22 @@ static struct padata_instance *padata_alloc(const char *name, !padata_validate_cpumask(pinst, cbcpumask)) goto err_free_masks; - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) goto err_free_masks; + if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) + goto err_free_rcpumask_pcpu; + if (!alloc_cpumask_var(&pinst->omask, GFP_KERNEL)) + goto err_free_rcpumask_cbcpu; - rcu_assign_pointer(pinst->pd, pd); + INIT_LIST_HEAD(&pinst->pslist); cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); + cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); + + if (padata_setup_cpumasks(pinst)) + goto err_free_omask; pinst->flags = 0; @@ -962,6 +995,12 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; +err_free_omask: + free_cpumask_var(pinst->omask); +err_free_rcpumask_cbcpu: + free_cpumask_var(pinst->rcpumask.cbcpu); +err_free_rcpumask_pcpu: + free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); @@ -1000,6 +1039,61 @@ void padata_free(struct padata_instance *pinst) } EXPORT_SYMBOL(padata_free); +/** + * padata_alloc_shell - Allocate and initialize padata shell. + * + * @pinst: Parent padata_instance object. + */ +struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) +{ + struct parallel_data *pd; + struct padata_shell *ps; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out; + + ps->pinst = pinst; + + get_online_cpus(); + pd = padata_alloc_pd(ps); + put_online_cpus(); + + if (!pd) + goto out_free_ps; + + mutex_lock(&pinst->lock); + RCU_INIT_POINTER(ps->pd, pd); + list_add(&ps->list, &pinst->pslist); + mutex_unlock(&pinst->lock); + + return ps; + +out_free_ps: + kfree(ps); +out: + return NULL; +} +EXPORT_SYMBOL(padata_alloc_shell); + +/** + * padata_free_shell - free a padata shell + * + * @ps: padata shell to free + */ +void padata_free_shell(struct padata_shell *ps) +{ + struct padata_instance *pinst = ps->pinst; + + mutex_lock(&pinst->lock); + list_del(&ps->list); + padata_free_pd(rcu_dereference_protected(ps->pd, 1)); + mutex_unlock(&pinst->lock); + + kfree(ps); +} +EXPORT_SYMBOL(padata_free_shell); + #ifdef CONFIG_HOTPLUG_CPU static __init int padata_driver_init(void) -- cgit v1.2.3 From 894c9ef9780c5cf2f143415e867ee39a33ecb75d Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:10 -0500 Subject: padata: validate cpumask without removed CPU during offline Configuring an instance's parallel mask without any online CPUs... echo 2 > /sys/kernel/pcrypt/pencrypt/parallel_cpumask echo 0 > /sys/devices/system/cpu/cpu1/online ...makes tcrypt mode=215 crash like this: divide error: 0000 [#1] SMP PTI CPU: 4 PID: 283 Comm: modprobe Not tainted 5.4.0-rc8-padata-doc-v2+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20191013_105130-anatol 04/01/2014 RIP: 0010:padata_do_parallel+0x114/0x300 Call Trace: pcrypt_aead_encrypt+0xc0/0xd0 [pcrypt] crypto_aead_encrypt+0x1f/0x30 do_mult_aead_op+0x4e/0xdf [tcrypt] test_mb_aead_speed.constprop.0.cold+0x226/0x564 [tcrypt] do_test+0x28c2/0x4d49 [tcrypt] tcrypt_mod_init+0x55/0x1000 [tcrypt] ... cpumask_weight() in padata_cpu_hash() returns 0 because the mask has no CPUs. The problem is __padata_remove_cpu() checks for valid masks too early and so doesn't mark the instance PADATA_INVALID as expected, which would have made padata_do_parallel() return error before doing the division. Fix by introducing a second padata CPU hotplug state before CPUHP_BRINGUP_CPU so that __padata_remove_cpu() sees the online mask without @cpu. No need for the second argument to padata_replace() since @cpu is now already missing from the online mask. Fixes: 33e54450683c ("padata: Handle empty padata cpumasks") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Sebastian Andrzej Siewior Cc: Steffen Klassert Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 8c8755f170ca..1e6500d64846 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -512,7 +512,7 @@ static int padata_replace_one(struct padata_shell *ps) return 0; } -static int padata_replace(struct padata_instance *pinst, int cpu) +static int padata_replace(struct padata_instance *pinst) { int notification_mask = 0; struct padata_shell *ps; @@ -523,16 +523,12 @@ static int padata_replace(struct padata_instance *pinst, int cpu) cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); - if (cpu >= 0) - cpumask_clear_cpu(cpu, pinst->rcpumask.pcpu); if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) notification_mask |= PADATA_CPU_PARALLEL; cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - if (cpu >= 0) - cpumask_clear_cpu(cpu, pinst->rcpumask.cbcpu); if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; @@ -624,7 +620,7 @@ out_replace: cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1); + err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst); if (valid) __padata_start(pinst); @@ -715,7 +711,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - err = padata_replace(pinst, -1); + err = padata_replace(pinst); if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) @@ -729,12 +725,12 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { int err = 0; - if (cpumask_test_cpu(cpu, cpu_online_mask)) { + if (!cpumask_test_cpu(cpu, cpu_online_mask)) { if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_stop(pinst); - err = padata_replace(pinst, cpu); + err = padata_replace(pinst); } return err; @@ -761,7 +757,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) return ret; } -static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct padata_instance *pinst; int ret; @@ -782,6 +778,7 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif @@ -989,6 +986,8 @@ static struct padata_instance *padata_alloc(const char *name, #ifdef CONFIG_HOTPLUG_CPU cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, + &pinst->node); #endif put_online_cpus(); @@ -1101,17 +1100,24 @@ static __init int padata_driver_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", - padata_cpu_online, - padata_cpu_prep_down); + padata_cpu_online, NULL); if (ret < 0) return ret; hp_online = ret; + + ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", + NULL, padata_cpu_dead); + if (ret < 0) { + cpuhp_remove_multi_state(hp_online); + return ret; + } return 0; } module_init(padata_driver_init); static __exit void padata_driver_exit(void) { + cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); cpuhp_remove_multi_state(hp_online); } module_exit(padata_driver_exit); -- cgit v1.2.3 From 38228e8848cd7dd86ccb90406af32de0cad24be3 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:11 -0500 Subject: padata: always acquire cpu_hotplug_lock before pinst->lock lockdep complains when padata's paths to update cpumasks via CPU hotplug and sysfs are both taken: # echo 0 > /sys/devices/system/cpu/cpu1/online # echo ff > /sys/kernel/pcrypt/pencrypt/parallel_cpumask ====================================================== WARNING: possible circular locking dependency detected 5.4.0-rc8-padata-cpuhp-v3+ #1 Not tainted ------------------------------------------------------ bash/205 is trying to acquire lock: ffffffff8286bcd0 (cpu_hotplug_lock.rw_sem){++++}, at: padata_set_cpumask+0x2b/0x120 but task is already holding lock: ffff8880001abfa0 (&pinst->lock){+.+.}, at: padata_set_cpumask+0x26/0x120 which lock already depends on the new lock. padata doesn't take cpu_hotplug_lock and pinst->lock in a consistent order. Which should be first? CPU hotplug calls into padata with cpu_hotplug_lock already held, so it should have priority. Fixes: 6751fb3c0e0c ("padata: Use get_online_cpus/put_online_cpus") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 1e6500d64846..f5964f015139 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -643,8 +643,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, struct cpumask *serial_mask, *parallel_mask; int err = -EINVAL; - mutex_lock(&pinst->lock); get_online_cpus(); + mutex_lock(&pinst->lock); switch (cpumask_type) { case PADATA_CPU_PARALLEL: @@ -662,8 +662,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: - put_online_cpus(); mutex_unlock(&pinst->lock); + put_online_cpus(); return err; } -- cgit v1.2.3 From 91a71d612128f84f725022d7b7c5d5a741f6fdc7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:12 -0500 Subject: padata: remove cpumask change notifier Since commit 63d3578892dc ("crypto: pcrypt - remove padata cpumask notifier") this feature is unused, so get rid of it. Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Jonathan Corbet Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index f5964f015139..bc594c00b26e 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -514,23 +514,16 @@ static int padata_replace_one(struct padata_shell *ps) static int padata_replace(struct padata_instance *pinst) { - int notification_mask = 0; struct padata_shell *ps; int err; pinst->flags |= PADATA_RESET; - cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); - if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) - notification_mask |= PADATA_CPU_PARALLEL; - cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) - notification_mask |= PADATA_CPU_SERIAL; list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); @@ -544,48 +537,11 @@ static int padata_replace(struct padata_instance *pinst) if (atomic_dec_and_test(&ps->opd->refcnt)) padata_free_pd(ps->opd); - if (notification_mask) - blocking_notifier_call_chain(&pinst->cpumask_change_notifier, - notification_mask, - &pinst->cpumask); - pinst->flags &= ~PADATA_RESET; return err; } -/** - * padata_register_cpumask_notifier - Registers a notifier that will be called - * if either pcpu or cbcpu or both cpumasks change. - * - * @pinst: A poineter to padata instance - * @nblock: A pointer to notifier block. - */ -int padata_register_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_register_cpumask_notifier); - -/** - * padata_unregister_cpumask_notifier - Unregisters cpumask notifier - * registered earlier using padata_register_cpumask_notifier - * - * @pinst: A pointer to data instance. - * @nlock: A pointer to notifier block. - */ -int padata_unregister_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_unregister( - &pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_unregister_cpumask_notifier); - - /* If cpumask contains no active cpu, we mark the instance as invalid. */ static bool padata_validate_cpumask(struct padata_instance *pinst, const struct cpumask *cpumask) @@ -785,7 +741,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); padata_stop(pinst); - free_cpumask_var(pinst->omask); free_cpumask_var(pinst->rcpumask.cbcpu); free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); @@ -965,8 +920,6 @@ static struct padata_instance *padata_alloc(const char *name, goto err_free_masks; if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) goto err_free_rcpumask_pcpu; - if (!alloc_cpumask_var(&pinst->omask, GFP_KERNEL)) - goto err_free_rcpumask_cbcpu; INIT_LIST_HEAD(&pinst->pslist); @@ -976,11 +929,10 @@ static struct padata_instance *padata_alloc(const char *name, cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_omask; + goto err_free_rcpumask_cbcpu; pinst->flags = 0; - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); @@ -994,8 +946,6 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; -err_free_omask: - free_cpumask_var(pinst->omask); err_free_rcpumask_cbcpu: free_cpumask_var(pinst->rcpumask.cbcpu); err_free_rcpumask_pcpu: -- cgit v1.2.3 From 3facced7aeed131c1002b724e488d68ebe59c56f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:13 -0500 Subject: padata: remove reorder_objects reorder_objects is unused since the rework of padata's flushing, so remove it. Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index bc594c00b26e..db950d287b3d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -202,7 +202,6 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); - atomic_dec(&pd->reorder_objects); ++pd->processed; pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); } @@ -336,7 +335,6 @@ void padata_do_serial(struct padata_priv *padata) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* @@ -455,7 +453,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); atomic_set(&pd->seq_nr, -1); - atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); -- cgit v1.2.3 From bfcdcef8c8e3469f4d6c082a1da27a6ef77e5715 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:14 -0500 Subject: padata: update documentation Remove references to unused functions, standardize language, update to reflect new functionality, migrate to rst format, and fix all kernel-doc warnings. Fixes: 815613da6a67 ("kernel/padata.c: removed unused code") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Jonathan Corbet Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index db950d287b3d..72777c10bb9c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -2,7 +2,7 @@ /* * padata.c - generic interface to process data streams in parallel * - * See Documentation/padata.txt for an api documentation. + * See Documentation/core-api/padata.rst for more information. * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert @@ -99,6 +99,8 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. + * + * Return: 0 on success or else negative error code. */ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) @@ -163,14 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); /* * padata_find_next - Find the next object that needs serialization. * - * Return values are: - * - * A pointer to the control struct of the next object that needs - * serialization, if present in one of the percpu reorder queues. - * - * NULL, if the next object that needs serialization will - * be parallel processed by another cpu and is not yet present in - * the cpu's reorder queue. + * Return: + * * A pointer to the control struct of the next object that needs + * serialization, if present in one of the percpu reorder queues. + * * NULL, if the next object that needs serialization will + * be parallel processed by another cpu and is not yet present in + * the cpu's reorder queue. */ static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) @@ -582,13 +582,14 @@ out_replace: } /** - * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value - * equivalent to @cpumask. - * + * padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. * @pinst: padata instance * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding * to parallel and serial cpumasks respectively. * @cpumask: the cpumask to use + * + * Return: 0 on success or negative error code */ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask) @@ -626,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask); * padata_start - start the parallel processing * * @pinst: padata instance to start + * + * Return: 0 on success or negative error code */ int padata_start(struct padata_instance *pinst) { @@ -880,6 +883,8 @@ static struct kobj_type padata_attr_type = { * @name: used to identify the instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Return: new instance on success, NULL on error */ static struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, @@ -967,6 +972,8 @@ err: * parallel workers. * * @name: used to identify the instance + * + * Return: new instance on success, NULL on error */ struct padata_instance *padata_alloc_possible(const char *name) { @@ -977,7 +984,7 @@ EXPORT_SYMBOL(padata_alloc_possible); /** * padata_free - free a padata instance * - * @padata_inst: padata instance to free + * @pinst: padata instance to free */ void padata_free(struct padata_instance *pinst) { @@ -989,6 +996,8 @@ EXPORT_SYMBOL(padata_free); * padata_alloc_shell - Allocate and initialize padata shell. * * @pinst: Parent padata_instance object. + * + * Return: new shell on success, NULL on error */ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) { -- cgit v1.2.3 From bae141f54be83b06652c1d47e50e4e75ed4e9c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 6 Dec 2019 22:49:34 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 ... ---- time->Wed Nov 27 16:04:13 2019 type=PROCTITLE msg=audit(1574867053.120:84664): proctitle="./bpf" type=SYSCALL msg=audit(1574867053.120:84664): arch=c000003e syscall=321 \ success=yes exit=3 a0=5 a1=7ffea484fbe0 a2=70 a3=0 items=0 ppid=7477 \ pid=12698 auid=1001 uid=1001 gid=1001 euid=1001 suid=1001 fsuid=1001 \ egid=1001 sgid=1001 fsgid=1001 tty=pts2 ses=4 comm="bpf" \ exe="/home/jolsa/auditd/audit-testsuite/tests/bpf/bpf" \ subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574867053.120:84664): prog-id=76 op=LOAD ---- time->Wed Nov 27 16:04:13 2019 type=UNKNOWN[1334] msg=audit(1574867053.120:84665): prog-id=76 op=UNLOAD ... Signed-off-by: Daniel Borkmann Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20191206214934.11319-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..66b90eaf99fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit v1.2.3 From 81c22041d9f19df07b9cba95e3cd02e0f41bc1e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Dec 2019 16:08:03 +0100 Subject: bpf, x86, arm64: Enable jit by default when not built as always-on After Spectre 2 fix via 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") most major distros use BPF_JIT_ALWAYS_ON configuration these days which compiles out the BPF interpreter entirely and always enables the JIT. Also given recent fix in e1608f3fa857 ("bpf: Avoid setting bpf insns pages read-only when prog is jited"), we additionally avoid fragmenting the direct map for the BPF insns pages sitting in the general data heap since they are not used during execution. Latter is only needed when run through the interpreter. Since both x86 and arm64 JITs have seen a lot of exposure over the years, are generally most up to date and maintained, there is more downside in !BPF_JIT_ALWAYS_ON configurations to have the interpreter enabled by default rather than the JIT. Add a ARCH_WANT_DEFAULT_BPF_JIT config which archs can use to set the bpf_jit_{enable,kallsyms} to 1. Back in the days the bpf_jit_kallsyms knob was set to 0 by default since major distros still had /proc/kallsyms addresses exposed to unprivileged user space which is not the case anymore. Hence both knobs are set via BPF_JIT_DEFAULT_ON which is set to 'y' in case of BPF_JIT_ALWAYS_ON or ARCH_WANT_DEFAULT_BPF_JIT. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Will Deacon Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/f78ad24795c2966efcc2ee19025fa3459f622185.1575903816.git.daniel@iogearbox.net --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..2ff01a716128 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -520,9 +520,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void -- cgit v1.2.3 From c30fe541896440667bd9e9068aedd1d440fbbcd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Oct 2019 21:40:09 -0700 Subject: rcu: Mark non-global functions and variables as static Each of rcu_state, rcu_rnp_online_cpus(), rcu_dynticks_curr_cpu_in_eqs(), and rcu_dynticks_snap() are used only in the kernel/rcu/tree.o translation unit, and may thus be marked static. This commit therefore makes this change. Reported-by: Ben Dooks Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..dd8cfc34f4da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..e4dc5debfc84 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -403,8 +403,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - /* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); -- cgit v1.2.3 From 98e8627efcada18ac043a77b9101b4b4c768090b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:07 +0100 Subject: bpf: Move trampoline JIT image allocation to a function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the image allocation in the BPF trampoline code into a separate function, so it can be shared with the BPF dispatcher in upcoming commits. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-2-bjorn.topel@gmail.com --- kernel/bpf/trampoline.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7e89f1f49d77..5ee301ddbd00 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -13,6 +13,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +void *bpf_jit_alloc_exec_page(void) +{ + void *image; + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return NULL; + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + return image; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -33,7 +49,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -47,12 +63,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. - */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); -- cgit v1.2.3 From 75ccbef6369e94ecac696a152a998a978d41376b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:08 +0100 Subject: bpf: Introduce BPF dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF dispatcher is a multi-way branch code generator, mainly targeted for XDP programs. When an XDP program is executed via the bpf_prog_run_xdp(), it is invoked via an indirect call. The indirect call has a substantial performance impact, when retpolines are enabled. The dispatcher transform indirect calls to direct calls, and therefore avoids the retpoline. The dispatcher is generated using the BPF JIT, and relies on text poking provided by bpf_arch_text_poke(). The dispatcher hijacks a trampoline function it via the __fentry__ nop of the trampoline. One dispatcher instance currently supports up to 64 dispatch points. A user creates a dispatcher with its corresponding trampoline with the DEFINE_BPF_DISPATCHER macro. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-3-bjorn.topel@gmail.com --- kernel/bpf/Makefile | 1 + kernel/bpf/dispatcher.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 kernel/bpf/dispatcher.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3f671bf617e8..d4f330351f87 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c new file mode 100644 index 000000000000..204ee61a3904 --- /dev/null +++ b/kernel/bpf/dispatcher.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2019 Intel Corporation. */ + +#include +#include +#include + +/* The BPF dispatcher is a multiway branch code generator. The + * dispatcher is a mechanism to avoid the performance penalty of an + * indirect call, which is expensive when retpolines are enabled. A + * dispatch client registers a BPF program into the dispatcher, and if + * there is available room in the dispatcher a direct call to the BPF + * program will be generated. All calls to the BPF programs called via + * the dispatcher will then be a direct call, instead of an + * indirect. The dispatcher hijacks a trampoline function it via the + * __fentry__ of the trampoline. The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} -- cgit v1.2.3 From 7e6897f95935973c3253fd756135b5ea58043dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:09 +0100 Subject: bpf, xdp: Start using the BPF dispatcher for XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a BPF dispatcher for XDP. The dispatcher is updated from the XDP control-path, dev_xdp_install(), and used when an XDP program is run via bpf_prog_run_xdp(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-4-bjorn.topel@gmail.com --- kernel/bpf/syscall.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 66b90eaf99fe..b08c362f4e02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2338,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); @@ -2357,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); + return prog; +} + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From 7c2e8bbd87db661122e92d71a394dd7bb3ada4d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:05 +0100 Subject: sched: Spare resched IPI when prio changes on a single fair task The runqueue of a fair task being remotely reniced is going to get a resched IPI in order to reassess which task should be the current running on the CPU. However that evaluation is useless if the fair task is running alone, in which case we can spare that IPI, preventing nohz_full CPUs from being disturbed. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-2-frederic@kernel.org --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08a233e97a01..846f50bd0c0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10322,6 +10322,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on -- cgit v1.2.3 From 5443a0be6121d557e12951537e10159e4c61035d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:06 +0100 Subject: sched: Use fair:prio_changed() instead of ad-hoc implementation set_user_nice() implements its own version of fair::prio_changed() and therefore misses a specific optimization towards nohz_full CPUs that avoid sending an resched IPI to a reniced task running alone. Use the proper callback instead. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-3-frederic@kernel.org --- kernel/sched/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90e4b00ace89..15508c202bf5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4540,17 +4540,17 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3 From cde65194502778665c1b52afc5722cf7dbfaa399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Dec 2019 20:19:03 +0100 Subject: sched/wait: fix ___wait_var_event(exclusive) init_wait_var_entry() forgets to initialize wq_entry->flags. Currently not a problem, we don't have wait_var_event_exclusive(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Vincent Guittot Cc: Ingo Molnar Cc: Felipe Balbi Cc: Linus Torvalds Cc: Miklos Szeredi Cc: Juri Lelli Link: https://lkml.kernel.org/r/20191210191902.GB14449@redhat.com --- kernel/sched/wait_bit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 45eba18a2898..02ce292b9bc0 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int .bit_nr = -1, }, .wq_entry = { + .flags = flags, .private = current, .func = var_wake_function, .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), -- cgit v1.2.3 From 45178ac0cea853fe0e405bf11e101bdebea57b15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Dec 2019 09:34:54 +0100 Subject: cpu/hotplug, stop_machine: Fix stop_machine vs hotplug order Paul reported a very sporadic, rcutorture induced, workqueue failure. When the planets align, the workqueue rescuer's self-migrate fails and then triggers a WARN for running a work on the wrong CPU. Tejun then figured that set_cpus_allowed_ptr()'s stop_one_cpu() call could be ignored! When stopper->enabled is false, stop_machine will insta complete the work, without actually doing the work. Worse, it will not WARN about this (we really should fix this). It turns out there is a small window where a freshly online'ed CPU is marked 'online' but doesn't yet have the stopper task running: BP AP bringup_cpu() __cpu_up(cpu, idle) --> start_secondary() ... cpu_startup_entry() bringup_wait_for_ap() wait_for_ap_thread() <-- cpuhp_online_idle() while (1) do_idle() ... available to run kthreads ... stop_machine_unpark() stopper->enable = true; Close this by moving the stop_machine_unpark() into cpuhp_online_idle(), such that the stopper thread is ready before we start the idle loop and schedule. Reported-by: "Paul E. McKenney" Debugged-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Paul E. McKenney" --- kernel/cpu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..e7f79674824d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; - /* Unpark the stopper thread and the hotplug thread of the target cpu */ - stop_machine_unpark(cpu); + /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* @@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. Wake up the controlling task which brings the - * stopper and the hotplug thread of the upcoming CPU up and then delegates - * the rest of the online bringup to the hotplug thread. + * hotplug thread of the upcoming CPU up and then delegates the rest of the + * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { @@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + /* + * Unpart the stopper thread before we start the idle loop (and start + * scheduling); this ensures the stopper task is always available. + */ + stop_machine_unpark(smp_processor_id()); + st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } -- cgit v1.2.3 From 60588bfa223ff675b95f866249f90616613fbe31 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 13 Dec 2019 10:45:30 +0800 Subject: sched/fair: Optimize select_idle_cpu select_idle_cpu() will scan the LLC domain for idle CPUs, it's always expensive. so the next commit : 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") introduces a way to limit how many CPUs we scan. But it consume some CPUs out of 'nr' that are not allowed for the task and thus waste our attempts. The function always return nr_cpumask_bits, and we can't find a CPU which our task is allowed to run. Cpumask may be too big, similar to select_idle_core(), use per_cpu_ptr 'select_idle_mask' to prevent stack overflow. Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") Signed-off-by: Cheng Jian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20191213024530.28052-1-cj.chengjian@huawei.com --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 846f50bd0c0b..280d54ccb4be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5828,6 +5828,7 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; @@ -5859,11 +5860,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return si_cpu; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; if (available_idle_cpu(cpu)) break; if (si_cpu == -1 && sched_idle_cpu(cpu)) -- cgit v1.2.3 From d040e0734fb3dedfe24c3d94f5a32b4812eca610 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 13 Dec 2019 11:45:40 +0800 Subject: schied/fair: Skip calculating @contrib without load Because of the: if (!load) runnable = running = 0; clause in ___update_load_sum(), all the actual users of @contrib in accumulate_sum(): if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; don't happen, and therefore we don't care what @contrib actually is and calculating it is pointless. If we count the times when @load equals zero and not as below: if (load) { load_is_not_zero_count++; contrib = __accumulate_pelt_segments(periods, 1024 - sa->period_contrib,delta); } else load_is_zero_count++; As we can see, load_is_zero_count is much bigger than load_is_zero_count, and the gap is gradually widening: load_is_zero_count: 6016044 times load_is_not_zero_count: 244316 times 19:50:43 up 1 min, 1 user, load average: 0.09, 0.06, 0.02 load_is_zero_count: 7956168 times load_is_not_zero_count: 261472 times 19:51:42 up 2 min, 1 user, load average: 0.03, 0.05, 0.01 load_is_zero_count: 10199896 times load_is_not_zero_count: 278364 times 19:52:51 up 3 min, 1 user, load average: 0.06, 0.05, 0.01 load_is_zero_count: 14333700 times load_is_not_zero_count: 318424 times 19:54:53 up 5 min, 1 user, load average: 0.01, 0.03, 0.00 Perhaps we can gain some performance advantage by saving these unnecessary calculation. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot < vincent.guittot@linaro.org> Link: https://lkml.kernel.org/r/1576208740-35609-1-git-send-email-rocking@linux.alibaba.com --- kernel/sched/pelt.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..bd006b79b360 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * Step 2 */ delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); + if (load) { + /* + * This relies on the: + * + * if (!load) + * runnable = running = 0; + * + * clause from ___update_load_sum(); this results in + * the below usage of @contrib to dissapear entirely, + * so no point in calculating it. + */ + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } } sa->period_contrib = delta; @@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages() + * update_blocked_averages(). + * + * Also see the comment in accumulate_sum(). */ if (!load) runnable = running = 0; -- cgit v1.2.3 From a5e37de90e67ac1072a9a44bd0cec9f5e98ded08 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 14 Dec 2019 19:51:07 +0000 Subject: stop_machine: remove try_stop_cpus helper try_stop_cpus is not used after this: commit c190c3b16c0f ("rcu: Switch synchronize_sched_expedited() to stop_one_cpu()") So remove it. Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191214195107.26480-1-tiny.windzz@gmail.com --- kernel/stop_machine.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1fe34a9fabc2..5d68ec4c4015 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -/** - * try_stop_cpus - try to stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Identical to stop_cpus() except that it fails with -EAGAIN if - * someone else is already using the facility. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -EAGAIN if someone else is already stopping cpus, -ENOENT if - * @fn(@arg) was not executed at all because all cpus in @cpumask were - * offline; otherwise, 0 if all executions of @fn returned 0, any non - * zero return value if any returned non zero. - */ -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - if (!mutex_trylock(&stop_cpus_mutex)) - return -EAGAIN; - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); -- cgit v1.2.3 From 2d602bf28316e2f61a553f13d279f3d74c2e5189 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Oct 2019 16:34:25 +0200 Subject: acct: stop using get_seconds() In 'struct acct', 'struct acct_v3', and 'struct taskstats' we have a 32-bit 'ac_btime' field containing an absolute time value, which will overflow in year 2106. There are two possible ways to deal with it: a) let it overflow and have user space code deal with reconstructing the data based on the current time, or b) truncate the times based on the range of the u32 type. Neither of them solves the actual problem. Pick the second one to best document what the issue is, and have someone fix it in a future version. Signed-off-by: Arnd Bergmann --- kernel/acct.c | 4 +++- kernel/tsacct.c | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 81f9831a7859..11ff4a596d6b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -416,6 +416,7 @@ static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = ¤t->signal->pacct; u64 elapsed, run_time; + time64_t btime; struct tty_struct *tty; /* @@ -448,7 +449,8 @@ static void fill_ac(acct_t *ac) } #endif do_div(elapsed, AHZ); - ac->ac_btime = get_seconds() - elapsed; + btime = ktime_get_real_seconds() - elapsed; + ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); #if ACCT_VERSION==2 ac->ac_ahz = AHZ; #endif diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 7be3e7530841..ab12616ee6fb 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -24,6 +24,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; u64 delta; + time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -32,9 +33,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; - /* Convert to seconds for btime */ - do_div(delta, USEC_PER_SEC); - stats->ac_btime = get_seconds() - delta; + /* Convert to seconds for btime (note y2106 limit) */ + btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); + stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); + if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) -- cgit v1.2.3 From 352c912b0a525977a8e6fa1f87c15d9f71943642 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Oct 2019 16:47:56 +0200 Subject: tsacct: add 64-bit btime field As there is only a 32-bit ac_btime field in taskstat and we should handle dates after the overflow, add a new field with the same information but 64-bit width that can hold a full time64_t. Signed-off-by: Arnd Bergmann --- kernel/tsacct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index ab12616ee6fb..257ffb993ea2 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -36,6 +36,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); + stats->ac_btime64 = btime; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; -- cgit v1.2.3 From 751addac78b6f205ffd47c8736ca6d429dc77703 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Oct 2019 22:53:19 +0200 Subject: y2038: remove obsolete jiffies conversion functions Now that the last user of timespec_to_jiffies() is gone, these can just be removed, everything else is using ktime_t or timespec64 already. Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 58 +++++------------------------------------------------- 1 file changed, 5 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 704ccd9451b0..cdd7386115ff 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -626,10 +626,12 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -static unsigned long -__timespec64_to_jiffies(u64 sec, long nsec) + +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) { - nsec = nsec + TICK_NSEC - 1; + u64 sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -640,18 +642,6 @@ __timespec64_to_jiffies(u64 sec, long nsec) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } - -static unsigned long -__timespec_to_jiffies(unsigned long sec, long nsec) -{ - return __timespec64_to_jiffies((u64)sec, nsec); -} - -unsigned long -timespec64_to_jiffies(const struct timespec64 *value) -{ - return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); -} EXPORT_SYMBOL(timespec64_to_jiffies); void @@ -668,44 +658,6 @@ jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) } EXPORT_SYMBOL(jiffies_to_timespec64); -/* - * We could use a similar algorithm to timespec_to_jiffies (with a - * different multiplier for usec instead of nsec). But this has a - * problem with rounding: we can't exactly add TICK_NSEC - 1 to the - * usec value, since it's not necessarily integral. - * - * We could instead round in the intermediate scaled representation - * (i.e. in units of 1/2^(large scale) jiffies) but that's also - * perilous: the scaling introduces a small positive error, which - * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 - * units to the intermediate before shifting) leads to accidental - * overflow and overestimates. - * - * At the cost of one additional multiplication by a constant, just - * use the timespec implementation. - */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - return __timespec_to_jiffies(value->tv_sec, - value->tv_usec * NSEC_PER_USEC); -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - /* * Convert jiffies/jiffies_64 to clock_t and back. */ -- cgit v1.2.3 From 4f9fbd893fe83a1193adceca41c8f7aa6c7382a1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Nov 2019 15:53:29 +0100 Subject: y2038: rename itimerval to __kernel_old_itimerval Take the renaming of timeval and timespec one level further, also renaming itimerval to __kernel_old_itimerval, to avoid namespace conflicts with the user-space structure that may use 64-bit time_t members. Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9e59c9ea92aa..ca4e6d57d68b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -97,20 +97,20 @@ static int do_getitimer(int which, struct itimerspec64 *value) return 0; } -static int put_itimerval(struct itimerval __user *o, +static int put_itimerval(struct __kernel_old_itimerval __user *o, const struct itimerspec64 *i) { - struct itimerval v; + struct __kernel_old_itimerval v; v.it_interval.tv_sec = i->it_interval.tv_sec; v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v.it_value.tv_sec = i->it_value.tv_sec; v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; - return copy_to_user(o, &v, sizeof(struct itimerval)) ? -EFAULT : 0; + return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0; } -SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value) { struct itimerspec64 get_buffer; int error = do_getitimer(which, &get_buffer); @@ -314,11 +314,11 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i) +static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i) { - struct itimerval v; + struct __kernel_old_itimerval v; - if (copy_from_user(&v, i, sizeof(struct itimerval))) + if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval))) return -EFAULT; /* Validate the timevals in value. */ @@ -333,8 +333,8 @@ static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user * return 0; } -SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, - struct itimerval __user *, ovalue) +SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value, + struct __kernel_old_itimerval __user *, ovalue) { struct itimerspec64 set_buffer, get_buffer; int error; -- cgit v1.2.3 From 5bf2fc1f9c88397b125d5ec5f65b1ed9300ba59d Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Thu, 19 Dec 2019 11:57:35 -0600 Subject: bpf: Remove unnecessary assertion on fp_old The two callers of bpf_prog_realloc - bpf_patch_insn_single and bpf_migrate_filter dereference the struct fp_old, before passing it to the function. Thus assertion to check fp_old is unnecessary and can be removed. Signed-off-by: Aditya Pakki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219175735.19231-1-pakki001@umn.edu --- kernel/bpf/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2ff01a716128..7622dfc36705 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) -- cgit v1.2.3 From 0536b85239b8440735cdd910aae0eb076ebbb439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:09:59 +0100 Subject: xdp: Simplify devmap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in devmap can be simplified * There is no longer a need to flush in __dev_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in dev_map_free(). The rcu_barrier() is still needed, so that the map is not freed prior the elements. [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-2-bjorn.topel@gmail.com --- kernel/bpf/devmap.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..b7595de6a91a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -201,7 +201,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +221,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -345,8 +333,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +371,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -409,7 +392,7 @@ void __dev_map_flush(struct bpf_map *map) rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -440,7 +423,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +492,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); -- cgit v1.2.3 From 4bc188c7f23a5a308d7f15dda1b6a286d74e8954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:00 +0100 Subject: xdp: Simplify cpumap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in cpumap can be simplified * There is no longer a need to flush in __cpu_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in cpu_map_free(). [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-3-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..04c8eb11cd90 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -75,7 +75,7 @@ struct bpf_cpu_map { struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { @@ -399,7 +399,6 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one * RCU graze-period have elapsed. Thus, XDP cannot queue any @@ -408,13 +407,6 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -507,7 +499,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +513,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -599,7 +578,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +599,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -646,7 +622,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -688,7 +664,7 @@ void __cpu_map_flush(struct bpf_map *map) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); -- cgit v1.2.3 From fb5aacdf3603ccbafe1da74eecd132eb4a31e53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:01 +0100 Subject: xdp: Fix graze->grace type-o in cpumap comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple spelling fix. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-4-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04c8eb11cd90..f9deed659798 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -401,7 +401,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ @@ -428,7 +428,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -523,7 +523,7 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } free_percpu(cmap->flush_list); -- cgit v1.2.3 From e312b9e706ed6d94f6cc9088fcd9fbd81de4525c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:02 +0100 Subject: xsk: Make xskmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all xskmaps, which simplifies __xsk_map_flush() and xsk_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-5-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } -- cgit v1.2.3 From 96360004b8628541f5d05a845ea213267db0b1a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:03 +0100 Subject: xdp: Make devmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __dev_map_flush() and dev_map_init_map(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-6-bjorn.topel@gmail.com --- kernel/bpf/devmap.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b7595de6a91a..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -254,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -384,10 +371,9 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -419,7 +405,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -777,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } -- cgit v1.2.3 From cdfafe98cabefeedbbc65af5c191c59745c03298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:04 +0100 Subject: xdp: Make cpumap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpumap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __cpu_map_flush() and cpu_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-7-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f9deed659798..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -526,7 +517,6 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -618,7 +608,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -657,10 +647,9 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -670,3 +659,14 @@ void __cpu_map_flush(struct bpf_map *map) wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); -- cgit v1.2.3 From 1020c1f24a946e7d5d8a67db741b20efcd2cefc5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:33 -0800 Subject: bpf: Simplify __cgroup_bpf_attach __cgroup_bpf_attach has a lot of identical code to handle two scenarios: BPF_F_ALLOW_MULTI is set and unset. Simplify it by splitting the two main steps: * First, the decision is made whether a new bpf_prog_list entry should be allocated or existing entry should be reused for the new program. This decision is saved in replace_pl pointer; * Next, replace_pl pointer is used to handle both possible states of BPF_F_ALLOW_MULTI flag (set / unset) instead of doing similar work for them separately. This splitting, in turn, allows to make further simplifications: * The check for attaching same program twice in BPF_F_ALLOW_MULTI mode can be done before allocating cgroup storage, so that if user tries to attach same program twice no alloc/free happens as it was before; * pl_was_allocated becomes redundant so it's removed. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/c6193db6fe630797110b0d3ff06c125d093b834c.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 62 ++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f90d3c92bda..e8cbdd1be687 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -295,9 +295,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; - struct bpf_prog_list *pl; - bool pl_was_allocated; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -317,6 +316,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + } + } else if (!list_empty(progs)) { + replace_pl = list_first_entry(progs, typeof(*pl), node); + } + for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storage[stype])) { @@ -327,52 +336,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { - /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -EINVAL; - } + if (replace_pl) { + pl = replace_pl; + old_prog = pl->prog; + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); } - + } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } - - pl_was_allocated = true; - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); - } else { - if (list_empty(progs)) { - pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - pl_was_allocated = true; - list_add_tail(&pl->node, progs); - } else { - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } - pl_was_allocated = false; - } - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } + pl->prog = prog; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; + cgrp->bpf.flags[type] = flags; err = update_effective_progs(cgrp, type); @@ -401,7 +385,7 @@ cleanup: pl->storage[stype] = old_storage[stype]; bpf_cgroup_storage_link(old_storage[stype], cgrp, type); } - if (pl_was_allocated) { + if (!replace_pl) { list_del(&pl->node); kfree(pl); } -- cgit v1.2.3 From 9fab329d6a04c0a52a84d207b5e0d83aeb660aa0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:34 -0800 Subject: bpf: Remove unused new_flags in hierarchy_allows_attach() new_flags is unused, remove it. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/2c49b30ab750f93cfef04a1e40b097d70c3a39a1.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e8cbdd1be687..283efe3ce052 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type, - u32 new_flags) + enum bpf_attach_type type) { struct cgroup *p; @@ -303,7 +302,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, /* invalid combination */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type, flags)) + if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) -- cgit v1.2.3 From 7dd68b3279f1792103d12e69933db3128c6d416e Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:35 -0800 Subject: bpf: Support replacing cgroup-bpf program in MULTI mode The common use-case in production is to have multiple cgroup-bpf programs per attach type that cover multiple use-cases. Such programs are attached with BPF_F_ALLOW_MULTI and can be maintained by different people. Order of programs usually matters, for example imagine two egress programs: the first one drops packets and the second one counts packets. If they're swapped the result of counting program will be different. It brings operational challenges with updating cgroup-bpf program(s) attached with BPF_F_ALLOW_MULTI since there is no way to replace a program: * One way to update is to detach all programs first and then attach the new version(s) again in the right order. This introduces an interruption in the work a program is doing and may not be acceptable (e.g. if it's egress firewall); * Another way is attach the new version of a program first and only then detach the old version. This introduces the time interval when two versions of same program are working, what may not be acceptable if a program is not idempotent. It also imposes additional burden on program developers to make sure that two versions of their program can co-exist. Solve the problem by introducing a "replace" mode in BPF_PROG_ATTACH command for cgroup-bpf programs being attached with BPF_F_ALLOW_MULTI flag. This mode is enabled by newly introduced BPF_F_REPLACE attach flag and bpf_attr.replace_bpf_fd attribute to pass fd of the old program to replace That way user can replace any program among those attached with BPF_F_ALLOW_MULTI flag without the problems described above. Details of the new API: * If BPF_F_REPLACE is set but replace_bpf_fd doesn't have valid descriptor of BPF program, BPF_PROG_ATTACH will return corresponding error (EINVAL or EBADF). * If replace_bpf_fd has valid descriptor of BPF program but such a program is not attached to specified cgroup, BPF_PROG_ATTACH will return ENOENT. BPF_F_REPLACE is introduced to make the user intent clear, since replace_bpf_fd alone can't be used for this (its default value, 0, is a valid fd). BPF_F_REPLACE also makes it possible to extend the API in the future (e.g. add BPF_F_BEFORE and BPF_F_AFTER if needed). Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Narkyiko Link: https://lore.kernel.org/bpf/30cd850044a0057bdfcaaf154b7d2f39850ba813.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 4 ++-- kernel/cgroup/cgroup.c | 5 +++-- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 283efe3ce052..45346c79613a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -282,14 +282,17 @@ cleanup: * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Must be called with cgroup_mutex held. */ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags) { + u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], @@ -298,14 +301,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_cgroup_storage_type stype; int err; - if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || + ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -320,7 +324,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (pl->prog == prog) /* disallow attaching the same prog twice */ return -EINVAL; + if (pl->prog == replace_prog) + replace_pl = pl; } + if ((flags & BPF_F_REPLACE) && !replace_pl) + /* prog to replace not found for cgroup */ + return -ENOENT; } else if (!list_empty(progs)) { replace_pl = list_first_entry(progs, typeof(*pl), node); } @@ -356,7 +365,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, for_each_cgroup_storage_type(stype) pl->storage[stype] = storage[stype]; - cgrp->bpf.flags[type] = flags; + cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) @@ -522,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { + struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; @@ -529,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && + (attr->attach_flags & BPF_F_REPLACE)) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); + if (IS_ERR(replace_prog)) { + cgroup_put(cgrp); + return PTR_ERR(replace_prog); + } + } + + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, attr->attach_flags); + + if (replace_prog) + bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b08c362f4e02..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2073,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..725365df066d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6288,12 +6288,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.2.3 From 2a2ef473cc9111b0d21ce18f49cb34d93d6991ad Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 10 Dec 2019 18:05:40 +0100 Subject: PM: sleep: Switch to rtc_time64_to_tm()/rtc_tm_to_time64() Call the 64bit versions of rtc_tm time conversion to avoid the y2038 issue. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 60564b58de07..e1ed58adb69e 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -70,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) static char info_test[] __initdata = KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - unsigned long now; + time64_t now; struct rtc_wkalrm alm; int status; @@ -81,10 +81,10 @@ repeat: printk(err_readtime, dev_name(&rtc->dev), status); return; } - rtc_tm_to_time(&alm.time, &now); + now = rtc_tm_to_time64(&alm.time); memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); alm.enabled = true; status = rtc_set_alarm(rtc, &alm); -- cgit v1.2.3 From 66528a4575eee9f5a5270219894ab6178f146e84 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Dec 2019 11:17:11 -0500 Subject: rseq: Reject unknown flags on rseq unregister It is preferrable to reject unknown flags within rseq unregistration rather than to ignore them. It is an oversight caused by the fact that the check for unknown flags is after the rseq unregister flag check. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211161713.4490-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/rseq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 27c48eb7de40..a4f86a9d6937 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int ret; if (flags & RSEQ_FLAG_UNREGISTER) { + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; -- cgit v1.2.3 From 53a23364b6b0c679a8ecfc48e74d652f18e3631f Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Dec 2019 09:03:14 -0500 Subject: sched/core: Remove unused variable from set_user_nice() This commit left behind an unused variable: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") left behind an unused variable. kernel/sched/core.c: In function 'set_user_nice': kernel/sched/core.c:4507:16: warning: variable 'delta' set but not used int old_prio, delta; ^~~~~ Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") Link: https://lkml.kernel.org/r/20191219140314.1252-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15508c202bf5..1f6c094520e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4504,7 +4504,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio, delta; + int old_prio; struct rq_flags rf; struct rq *rq; @@ -4538,7 +4538,6 @@ void set_user_nice(struct task_struct *p, long nice) set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); -- cgit v1.2.3 From 17346452b25b98acfb395d2a82ec2e4ad0cb7a01 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Nov 2019 16:19:27 +0530 Subject: sched/fair: Make sched-idle CPU selection consistent throughout There are instances where we keep searching for an idle CPU despite already having a sched-idle CPU (in find_idlest_group_cpu(), select_idle_smt() and select_idle_cpu() and then there are places where we don't necessarily do that and return a sched-idle CPU as soon as we find one (in select_idle_sibling()). This looks a bit inconsistent and it may be worth having the same policy everywhere. On the other hand, choosing a sched-idle CPU over a idle one shall be beneficial from performance and power point of view as well, as we don't need to get the CPU online from a deep idle state which wastes quite a lot of time and energy and delays the scheduling of the newly woken up task. This patch tries to simplify code around sched-idle CPU selection and make it consistent throughout. Testing is done with the help of rt-app on hikey board (ARM64 octa-core, 2 clusters, 0-3 and 4-7). The cpufreq governor was set to performance to avoid any side affects from CPU frequency. Following are the tests performed: Test 1: 1-cfs-task: A single SCHED_NORMAL task is pinned to CPU5 which runs for 2333 us out of 7777 us (so gives time for the cluster to go in deep idle state). Test 2: 1-cfs-1-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and single SCHED_IDLE task is pinned on CPU6 (to make sure cluster 1 doesn't go in deep idle state). Test 3: 1-cfs-8-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and eight SCHED_IDLE tasks are created which run forever (not pinned anywhere, so they run on all CPUs). Checked with kernelshark that as soon as NORMAL task sleeps, the SCHED_IDLE task starts running on CPU5. And here are the results on mean latency (in us), using the "st" tool. $ st 1-cfs-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 90 592 197180 307.134 109.906 $ st 1-cfs-1-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 67 311 113850 177.336 41.4251 $ st 1-cfs-8-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 643 29 173 41364 64.3297 13.2344 The mean latency when we need to: - wakeup from deep idle state is 307 us. - wakeup from shallow idle state is 177 us. - preempt a SCHED_IDLE task is 64 us. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/b90cbcce608cef4e02a7bbfe178335f76d201bab.1573728344.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8da0222924cf..1f34fa9732d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5588,7 +5588,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1, si_cpu = -1; + int shallowest_idle_cpu = -1; int i; /* Check if we have any choice: */ @@ -5597,6 +5597,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5619,12 +5622,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { - if (sched_idle_cpu(i)) { - si_cpu = i; - continue; - } - + } else if (shallowest_idle_cpu == -1) { load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5633,11 +5631,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - if (shallowest_idle_cpu != -1) - return shallowest_idle_cpu; - if (si_cpu != -1) - return si_cpu; - return least_loaded_cpu; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5790,7 +5784,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu, si_cpu = -1; + int cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5798,13 +5792,11 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } - return si_cpu; + return -1; } #else /* CONFIG_SCHED_SMT */ @@ -5834,7 +5826,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int this = smp_processor_id(); - int cpu, nr = INT_MAX, si_cpu = -1; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5864,11 +5856,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) - return si_cpu; - if (available_idle_cpu(cpu)) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } time = cpu_clock(this) - time; -- cgit v1.2.3 From 59fe675248ffc37d4167e9ec6920a2f3d5ec67bb Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:47 +0000 Subject: sched/uclamp: Remove uclamp_util() The sole user of uclamp_util(), schedutil_cpu_util(), was made to use uclamp_util_with() instead in commit: af24bde8df20 ("sched/uclamp: Add uclamp support to energy_compute()") From then on, uclamp_util() has remained unused. Being a simple wrapper around uclamp_util_with(), we can get rid of it and win back a few lines. Tested-By: Dietmar Eggemann Suggested-by: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 280a3c735935..d9b24513d71d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2324,21 +2324,12 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } - -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return uclamp_util_with(rq, util, NULL); -} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, struct task_struct *p) { return util; } -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return util; -} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From 686516b55e98edf18c2a02d36aaaa6f4c0f6c39c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:48 +0000 Subject: sched/uclamp: Make uclamp util helpers use and return UL values Vincent pointed out recently that the canonical type for utilization values is 'unsigned long'. Internally uclamp uses 'unsigned int' values for cache optimization, but this doesn't have to be exported to its users. Make the uclamp helpers that deal with utilization use and return unsigned long values. Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/sched.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f6c094520e0..e7b08d52db93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; /* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) - return p->uclamp[clamp_id].value; + return (unsigned long)p->uclamp[clamp_id].value; uc_eff = uclamp_eff_get(p, clamp_id); - return uc_eff.value; + return (unsigned long)uc_eff.value; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d9b24513d71d..b478474ea847 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2325,8 +2325,8 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } -- cgit v1.2.3 From d2b58a286e89824900d501db0be1d4f6aed474fc Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:49 +0000 Subject: sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() The current helper returns (CPU) rq utilization with uclamp restrictions taken into account. A uclamp task utilization helper would be quite helpful, but this requires some renaming. Prepare the code for the introduction of a uclamp_task_util() by renaming the existing uclamp_util_with() to uclamp_rq_util_with(). Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/sched.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9b8916fd00a2..7fbaee24c824 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util_with(rq, util, p); + util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b478474ea847..1a88dc8ad11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2303,8 +2303,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); @@ -2325,8 +2325,9 @@ unsigned long uclamp_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } -- cgit v1.2.3 From a7008c07a568278ed2763436404752a98004c7ff Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:50 +0000 Subject: sched/fair: Make task_fits_capacity() consider uclamp restrictions task_fits_capacity() drives CPU selection at wakeup time, and is also used to detect misfit tasks. Right now it does so by comparing task_util_est() with a CPU's capacity, but doesn't take into account uclamp restrictions. There's a few interesting uses that can come out of doing this. For instance, a low uclamp.max value could prevent certain tasks from being flagged as misfit tasks, so they could merrily remain on low-capacity CPUs. Similarly, a high uclamp.min value would steer tasks towards high capacity CPUs at wakeup (and, should that fail, later steered via misfit balancing), so such "boosted" tasks would favor CPUs of higher capacity. Introduce uclamp_task_util() and make task_fits_capacity() use it. Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-5-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f34fa9732d8..26c59bc5b2ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3822,7 +3836,7 @@ done: static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return fits_capacity(task_util_est(p), capacity); + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) -- cgit v1.2.3 From 1d42509e475cdc8542aa5b3e03a7e845244f4f57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:51 +0000 Subject: sched/fair: Make EAS wakeup placement consider uclamp restrictions task_fits_capacity() has just been made uclamp-aware, and find_energy_efficient_cpu() needs to go through the same treatment. Things are somewhat different here however - using the task max clamp isn't sufficient. Consider the following setup: The target runqueue, rq: rq.cpu_capacity_orig = 512 rq.cfs.avg.util_avg = 200 rq.uclamp.max = 768 // the max p.uclamp.max of all enqueued p's is 768 The waking task, p (not yet enqueued on rq): p.util_est = 600 p.uclamp.max = 100 Now, consider the following code which doesn't use the rq clamps: util = uclamp_task_util(p); // Does the task fit in the spare CPU capacity? cpu = cpu_of(rq); fits_capacity(util, cpu_capacity(cpu) - cpu_util(cpu)) This would lead to: util = 100; fits_capacity(100, 512 - 200) fits_capacity() would return true. However, enqueuing p on that CPU *will* cause it to become overutilized since rq clamp values are max-aggregated, so we'd remain with rq.uclamp.max = 768 which comes from the other tasks already enqueued on rq. Thus, we could select a high enough frequency to reach beyond 0.8 * 512 utilization (== overutilized) after enqueuing p on rq. What find_energy_efficient_cpu() needs here is uclamp_rq_util_with() which lets us peek at the future utilization landscape, including rq-wide uclamp values. Make find_energy_efficient_cpu() use uclamp_rq_util_with() for its fits_capacity() check. This is in line with what compute_energy() ends up using for estimating utilization. Tested-By: Dietmar Eggemann Suggested-by: Quentin Perret Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-6-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26c59bc5b2ed..2d170b5da0e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,9 +6273,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) continue; @@ -6290,7 +6299,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; -- cgit v1.2.3 From 804d402fb6f6487b825aae8cf42fda6426c62867 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Wed, 9 Oct 2019 11:46:11 +0100 Subject: sched/rt: Make RT capacity-aware Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has >= capacity value than the cfs_task.util. capacity_orig_of(cpu) >= cfs_task.util DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp value are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min. Which is what implemented here. capacity_orig_of(cpu) >= rt_task.uclamp_min Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which make for a better more predictable behavior for the default case. Must stress that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. [ mingo: Minor edits to comments. ] Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 25 ++++++++++++++-- kernel/sched/cpupri.h | 4 ++- kernel/sched/rt.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 94 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b7abca987d94..1a2719e1350a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -46,6 +46,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. By the time the call returns, the CPUs may have in @@ -57,7 +59,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* @@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 7dc20a3232e7..32dd520db11f 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -18,7 +18,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e591d40fd645..4043abe45459 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + test = curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; @@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ /* @@ -2147,12 +2195,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) -- cgit v1.2.3 From d91f3057263ceb691ef527e71b41a56b17f6c869 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Dec 2019 08:51:28 -0500 Subject: locking/lockdep: Fix buffer overrun problem in stack_trace[] If the lockdep code is really running out of the stack_trace entries, it is likely that buffer overrun can happen and the data immediately after stack_trace[] will be corrupted. If there is less than LOCK_TRACE_SIZE_IN_LONGS entries left before the call to save_trace(), the max_entries computation will leave it with a very large positive number because of its unsigned nature. The subsequent call to stack_trace_save() will then corrupt the data after stack_trace[]. Fix that by changing max_entries to a signed integer and check for negative value before calling stack_trace_save(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 12593b7467f9 ("locking/lockdep: Reduce space occupied by stack traces") Link: https://lkml.kernel.org/r/20191220135128.14876-1-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32282e7112d3..32406ef0d6a2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void) struct lock_trace *trace, *t2; struct hlist_head *hash_head; u32 hash; - unsigned int max_entries; + int max_entries; BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); @@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void) trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - LOCK_TRACE_SIZE_IN_LONGS; - trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - - LOCK_TRACE_SIZE_IN_LONGS - 1) { + if (max_entries <= 0) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void) return NULL; } + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); hash = jhash(trace->entries, trace->nr_entries * sizeof(trace->entries[0]), 0); -- cgit v1.2.3 From def97da136515cb289a14729292c193e0a93bc64 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 19 Dec 2019 12:59:22 +0106 Subject: printk: fix exclusive_console replaying Commit f92b070f2dc8 ("printk: Do not miss new messages when replaying the log") introduced a new variable @exclusive_console_stop_seq to store when an exclusive console should stop printing. It should be set to the @console_seq value at registration. However, @console_seq is previously set to @syslog_seq so that the exclusive console knows where to begin. This results in the exclusive console immediately reactivating all the other consoles and thus repeating the messages for those consoles. Set @console_seq after @exclusive_console_stop_seq has stored the current @console_seq value. Fixes: f92b070f2dc8 ("printk: Do not miss new messages when replaying the log") Link: http://lkml.kernel.org/r/20191219115322.31160-1-john.ogness@linutronix.de Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: John Ogness Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1ef6f75d92f1..fada22dc4ab6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2770,8 +2770,6 @@ void register_console(struct console *newcon) * for us. */ logbuf_lock_irqsave(flags); - console_seq = syslog_seq; - console_idx = syslog_idx; /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -2783,6 +2781,8 @@ void register_console(struct console *newcon) */ exclusive_console = newcon; exclusive_console_stop_seq = console_seq; + console_seq = syslog_seq; + console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); -- cgit v1.2.3 From 02f4e01ce710fe20d2e5548d52bfdea52efd09d1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Jan 2020 19:04:57 -0500 Subject: tracing: Initialize val to zero in parse_entry of inject code gcc produces a variable may be uninitialized warning for "val" in parse_entry(). This is really a false positive, but the code is subtle enough to just initialize val to zero and it's not a fast path to worry about it. Marked for stable to remove the warning in the stable trees as well. Cc: stable@vger.kernel.org Fixes: 6c3edaf9fd6a3 ("tracing: Introduce trace event injection") Reported-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d45079ee62f8..22bcf7c51d1e 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -195,7 +195,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) unsigned long irq_flags; void *entry = NULL; int entry_size; - u64 val; + u64 val = 0; int len; entry = trace_alloc_entry(call, &entry_size); -- cgit v1.2.3 From d2ccbccb5444e9141b33cf5399927737e9ff1c3d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Jan 2020 21:56:44 -0500 Subject: tracing: Define MCOUNT_INSN_SIZE when not defined without direct calls In order to handle direct calls along side of function graph tracer, a check is made to see if the address being traced by the function graph tracer is a direct call or not. To get the address used by direct callers, the return address is subtracted by MCOUNT_INSN_SIZE. For some archs with certain configurations, MCOUNT_INSN_SIZE is undefined here. But these should not be using direct calls anyway. Just define MCOUNT_INSN_SIZE to zero in this case. Link: https://lore.kernel.org/r/202001020219.zvE3vsty%lkp@intel.com Reported-by: kbuild test robot Fixes: ff205766dbbee ("ftrace: Fix function_graph tracer interaction with BPF trampoline") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index a2659735db73..1af321dec0f1 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -96,6 +96,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return 0; } +/* + * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct + * functions. But those archs currently don't support direct functions + * anyway, and ftrace_find_rec_direct() is just a stub for them. + * Define MCOUNT_INSN_SIZE to keep those archs compiling. + */ +#ifndef MCOUNT_INSN_SIZE +/* Make sure this only works without direct calls */ +# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +# error MCOUNT_INSN_SIZE not defined with direct calls enabled +# endif +# define MCOUNT_INSN_SIZE 0 +#endif + int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { -- cgit v1.2.3 From b8299d362d0837ae39e87e9019ebe6b736e0f035 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Jan 2020 22:02:41 -0500 Subject: tracing: Have stack tracer compile when MCOUNT_INSN_SIZE is not defined On some archs with some configurations, MCOUNT_INSN_SIZE is not defined, and this makes the stack tracer fail to compile. Just define it to zero in this case. Link: https://lore.kernel.org/r/202001020219.zvE3vsty%lkp@intel.com Cc: stable@vger.kernel.org Fixes: 4df297129f622 ("tracing: Remove most or all of stack tracer stack size from stack_max_size") Reported-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4df9a209f7ca..c557f42a9397 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -283,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack) local_irq_restore(flags); } +/* Some archs may not define MCOUNT_INSN_SIZE */ +#ifndef MCOUNT_INSN_SIZE +# define MCOUNT_INSN_SIZE 0 +#endif + static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) -- cgit v1.2.3 From e31f7939c1c27faa5d0e3f14519eaf7c89e8a69d Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 3 Jan 2020 11:02:48 +0800 Subject: ftrace: Avoid potential division by zero in function profiler The ftrace_profile->counter is unsigned long and do_div truncates it to 32 bits, which means it can test non-zero and be truncated to zero for division. Fix this issue by using div64_ul() instead. Link: http://lkml.kernel.org/r/20200103030248.14516-1-wenyang@linux.alibaba.com Cc: stable@vger.kernel.org Fixes: e330b3bcd8319 ("tracing: Show sample std dev in function profiling") Fixes: 34886c8bc590f ("tracing: add average time in function to function profiler") Signed-off-by: Wen Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac99a3500076..9bf1f2cd515e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -526,8 +526,7 @@ static int function_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER - avg = rec->time; - do_div(avg, rec->counter); + avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) goto out; #endif @@ -553,7 +552,8 @@ static int function_stat_show(struct seq_file *m, void *v) * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); -- cgit v1.2.3 From 50f9ad607ea891a9308e67b81f774c71736d1098 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Tue, 31 Dec 2019 05:35:30 -0800 Subject: kernel/trace: Fix do not unregister tracepoints when register sched_migrate_task fail In the function, if register_trace_sched_migrate_task() returns error, sched_switch/sched_wakeup_new/sched_wakeup won't unregister. That is why fail_deprobe_sched_switch was added. Link: http://lkml.kernel.org/r/20191231133530.2794-1-pilgrimtao@gmail.com Cc: stable@vger.kernel.org Fixes: 478142c39c8c2 ("tracing: do not grab lock in wakeup latency function tracing") Signed-off-by: Kaitao Cheng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5e43b9664eca..617e297f46dc 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -630,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr) if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); - return; + goto fail_deprobe_sched_switch; } wakeup_reset(tr); @@ -648,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr) printk(KERN_ERR "failed to start wakeup tracer\n"); return; +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: -- cgit v1.2.3 From 72879ee0c53e2fc17f443f7b1adcc0d5130cd934 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 21 Dec 2019 15:48:25 +0000 Subject: tracing: Fix indentation issue There is a declaration that is indented one level too deeply, remove the extraneous tab. Link: http://lkml.kernel.org/r/20191221154825.33073-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 344e4c1aa09c..87de6edafd14 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -381,7 +381,7 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { - unsigned int save_len = s->seq.len; + unsigned int save_len = s->seq.len; if (s->full) return 0; -- cgit v1.2.3 From e10360f815ca6367357b2c2cfef17fc663e50f7b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 27 Dec 2019 13:50:34 -0800 Subject: bpf: cgroup: prevent out-of-order release of cgroup bpf Before commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") cgroup bpf structures were released with corresponding cgroup structures. It guaranteed the hierarchical order of destruction: children were always first. It preserved attached programs from being released before their propagated copies. But with cgroup auto-detachment there are no such guarantees anymore: cgroup bpf is released as soon as the cgroup is offline and there are no live associated sockets. It means that an attached program can be detached and released, while its propagated copy is still living in the cgroup subtree. This will obviously lead to an use-after-free bug. To reproduce the issue the following script can be used: #!/bin/bash CGROOT=/sys/fs/cgroup mkdir -p ${CGROOT}/A ${CGROOT}/B ${CGROOT}/A/C sleep 1 ./test_cgrp2_attach ${CGROOT}/A egress & A_PID=$! ./test_cgrp2_attach ${CGROOT}/B egress & B_PID=$! echo $$ > ${CGROOT}/A/C/cgroup.procs iperf -s & S_PID=$! iperf -c localhost -t 100 & C_PID=$! sleep 1 echo $$ > ${CGROOT}/B/cgroup.procs echo ${S_PID} > ${CGROOT}/B/cgroup.procs echo ${C_PID} > ${CGROOT}/B/cgroup.procs sleep 1 rmdir ${CGROOT}/A/C rmdir ${CGROOT}/A sleep 1 kill -9 ${S_PID} ${C_PID} ${A_PID} ${B_PID} On the unpatched kernel the following stacktrace can be obtained: [ 33.619799] BUG: unable to handle page fault for address: ffffbdb4801ab002 [ 33.620677] #PF: supervisor read access in kernel mode [ 33.621293] #PF: error_code(0x0000) - not-present page [ 33.622754] Oops: 0000 [#1] SMP NOPTI [ 33.623202] CPU: 0 PID: 601 Comm: iperf Not tainted 5.5.0-rc2+ #23 [ 33.625545] RIP: 0010:__cgroup_bpf_run_filter_skb+0x29f/0x3d0 [ 33.635809] Call Trace: [ 33.636118] ? __cgroup_bpf_run_filter_skb+0x2bf/0x3d0 [ 33.636728] ? __switch_to_asm+0x40/0x70 [ 33.637196] ip_finish_output+0x68/0xa0 [ 33.637654] ip_output+0x76/0xf0 [ 33.638046] ? __ip_finish_output+0x1c0/0x1c0 [ 33.638576] __ip_queue_xmit+0x157/0x410 [ 33.639049] __tcp_transmit_skb+0x535/0xaf0 [ 33.639557] tcp_write_xmit+0x378/0x1190 [ 33.640049] ? _copy_from_iter_full+0x8d/0x260 [ 33.640592] tcp_sendmsg_locked+0x2a2/0xdc0 [ 33.641098] ? sock_has_perm+0x10/0xa0 [ 33.641574] tcp_sendmsg+0x28/0x40 [ 33.641985] sock_sendmsg+0x57/0x60 [ 33.642411] sock_write_iter+0x97/0x100 [ 33.642876] new_sync_write+0x1b6/0x1d0 [ 33.643339] vfs_write+0xb6/0x1a0 [ 33.643752] ksys_write+0xa7/0xe0 [ 33.644156] do_syscall_64+0x5b/0x1b0 [ 33.644605] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by grabbing a reference to the bpf structure of each ancestor on the initialization of the cgroup bpf structure, and dropping the reference at the end of releasing the cgroup bpf structure. This will restore the hierarchical order of cgroup bpf releasing, without adding any operations on hot paths. Thanks to Josef Bacik for the debugging and the initial analysis of the problem. Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Josef Bacik Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4fb20ab179fe..9e43b72eb619 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp) */ static void cgroup_bpf_release(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, - bpf.release_work); + struct cgroup *p, *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_unlock(&cgroup_mutex); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -199,6 +202,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array *arrays[NR] = {}; + struct cgroup *p; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -206,6 +210,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) if (ret) return ret; + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_get(p); + for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); -- cgit v1.2.3 From 6d4f151acf9a4f6fab09b615f246c717ddedcf0c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 6 Jan 2020 22:51:57 +0100 Subject: bpf: Fix passing modified ctx to ld/abs/ind instruction Anatoly has been fuzzing with kBdysch harness and reported a KASAN slab oob in one of the outcomes: [...] [ 77.359642] BUG: KASAN: slab-out-of-bounds in bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.360463] Read of size 4 at addr ffff8880679bac68 by task bpf/406 [ 77.361119] [ 77.361289] CPU: 2 PID: 406 Comm: bpf Not tainted 5.5.0-rc2-xfstests-00157-g2187f215eba #1 [ 77.362134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 77.362984] Call Trace: [ 77.363249] dump_stack+0x97/0xe0 [ 77.363603] print_address_description.constprop.0+0x1d/0x220 [ 77.364251] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.365030] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.365860] __kasan_report.cold+0x37/0x7b [ 77.366365] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.366940] kasan_report+0xe/0x20 [ 77.367295] bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.367821] ? bpf_skb_load_helper_8+0xf0/0xf0 [ 77.368278] ? mark_lock+0xa3/0x9b0 [ 77.368641] ? kvm_sched_clock_read+0x14/0x30 [ 77.369096] ? sched_clock+0x5/0x10 [ 77.369460] ? sched_clock_cpu+0x18/0x110 [ 77.369876] ? bpf_skb_load_helper_8+0xf0/0xf0 [ 77.370330] ___bpf_prog_run+0x16c0/0x28f0 [ 77.370755] __bpf_prog_run32+0x83/0xc0 [ 77.371153] ? __bpf_prog_run64+0xc0/0xc0 [ 77.371568] ? match_held_lock+0x1b/0x230 [ 77.371984] ? rcu_read_lock_held+0xa1/0xb0 [ 77.372416] ? rcu_is_watching+0x34/0x50 [ 77.372826] sk_filter_trim_cap+0x17c/0x4d0 [ 77.373259] ? sock_kzfree_s+0x40/0x40 [ 77.373648] ? __get_filter+0x150/0x150 [ 77.374059] ? skb_copy_datagram_from_iter+0x80/0x280 [ 77.374581] ? do_raw_spin_unlock+0xa5/0x140 [ 77.375025] unix_dgram_sendmsg+0x33a/0xa70 [ 77.375459] ? do_raw_spin_lock+0x1d0/0x1d0 [ 77.375893] ? unix_peer_get+0xa0/0xa0 [ 77.376287] ? __fget_light+0xa4/0xf0 [ 77.376670] __sys_sendto+0x265/0x280 [ 77.377056] ? __ia32_sys_getpeername+0x50/0x50 [ 77.377523] ? lock_downgrade+0x350/0x350 [ 77.377940] ? __sys_setsockopt+0x2a6/0x2c0 [ 77.378374] ? sock_read_iter+0x240/0x240 [ 77.378789] ? __sys_socketpair+0x22a/0x300 [ 77.379221] ? __ia32_sys_socket+0x50/0x50 [ 77.379649] ? mark_held_locks+0x1d/0x90 [ 77.380059] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 77.380536] __x64_sys_sendto+0x74/0x90 [ 77.380938] do_syscall_64+0x68/0x2a0 [ 77.381324] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 77.381878] RIP: 0033:0x44c070 [...] After further debugging, turns out while in case of other helper functions we disallow passing modified ctx, the special case of ld/abs/ind instruction which has similar semantics (except r6 being the ctx argument) is missing such check. Modified ctx is impossible here as bpf_skb_load_helper_8_no_cache() and others are expecting skb fields in original position, hence, add check_ctx_reg() to reject any modified ctx. Issue was first introduced back in f1174f77b50c ("bpf/verifier: rework value tracking"). Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200106215157.3553-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f63ae7a370c..ce85e7041f0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6264,6 +6264,7 @@ static bool may_access_skb(enum bpf_prog_type type) static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); + static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; @@ -6297,7 +6298,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(env, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; @@ -6316,7 +6317,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (regs[BPF_REG_6].type != PTR_TO_CTX) { + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; @@ -6329,6 +6330,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + if (err < 0) + return err; + /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); -- cgit v1.2.3 From 809ed78a832df8b4ef81fee03570566c8f088f3a Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 3 Jan 2020 23:54:58 +0800 Subject: PM: hibernate: improve arithmetic division in preallocate_highmem_fraction() do_div() does a 64-by-32 division. Use div64_u64() instead of do_div() if the divisor is u64, to avoid truncation to 32-bit. This change also cleans up code a tad. Signed-off-by: Wen Yang [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 26b9168321e7..8a6eaf7367f3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1566,9 +1566,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { - x *= multiplier; - do_div(x, base); - return (unsigned long)x; + return div64_u64(x * multiplier, base); } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, -- cgit v1.2.3 From 7a7b99bf801efd09b3202bf5b6216d70cb1363f1 Mon Sep 17 00:00:00 2001 From: Luigi Semenzato Date: Thu, 2 Jan 2020 15:19:40 -0800 Subject: PM: hibernate: Add more logging on hibernation failure Hibernation fails when the kernel cannot allocate enough memory to copy all pages of RAM in use. Ensure that the failure reason is clearly logged, and clearly attributable to the hibernation module. Signed-off-by: Luigi Semenzato [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 23 ++++++++++++----------- kernel/power/snapshot.c | 24 +++++++++++++++--------- 2 files changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 3c0a5a8170b0..6dbeedb7354c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -9,7 +9,7 @@ * Copyright (C) 2012 Bojan Smojver */ -#define pr_fmt(fmt) "PM: " fmt +#define pr_fmt(fmt) "PM: hibernation: " fmt #include #include @@ -106,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { - pr_info("hibernation debug: Waiting for 5 seconds.\n"); + pr_info("debug: Waiting for 5 seconds.\n"); mdelay(5000); } @@ -277,7 +277,7 @@ static int create_image(int platform_mode) error = dpm_suspend_end(PMSG_FREEZE); if (error) { - pr_err("Some devices failed to power down, aborting hibernation\n"); + pr_err("Some devices failed to power down, aborting\n"); return error; } @@ -295,7 +295,7 @@ static int create_image(int platform_mode) error = syscore_suspend(); if (error) { - pr_err("Some system devices failed to power down, aborting hibernation\n"); + pr_err("Some system devices failed to power down, aborting\n"); goto Enable_irqs; } @@ -310,7 +310,7 @@ static int create_image(int platform_mode) restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) - pr_err("Error %d creating hibernation image\n", error); + pr_err("Error %d creating image\n", error); if (!in_suspend) { events_check_enabled = false; @@ -680,7 +680,7 @@ static int load_image_and_restore(void) if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - pr_err("Failed to load hibernation image, recovering.\n"); + pr_err("Failed to load image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: @@ -743,7 +743,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pm_pr_dbg("Writing image.\n"); + pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -755,7 +755,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pm_pr_dbg("Image restored successfully.\n"); + pm_pr_dbg("Hibernation image restored successfully.\n"); } Free_bitmaps: @@ -894,7 +894,7 @@ static int software_resume(void) goto Close_Finish; } - pm_pr_dbg("Preparing processes for restore.\n"); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -903,7 +903,7 @@ static int software_resume(void) Finish: __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); - pr_info("resume from hibernation failed (%d)\n", error); + pr_info("resume failed (%d)\n", error); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: @@ -1068,7 +1068,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", + swsusp_resume_device); noresume = 0; software_resume(); return n; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 8a6eaf7367f3..353c5e9070ed 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -8,7 +8,7 @@ * Copyright (C) 2006 Rafael J. Wysocki */ -#define pr_fmt(fmt) "PM: " fmt +#define pr_fmt(fmt) "PM: hibernation: " fmt #include #include @@ -1703,16 +1703,20 @@ int hibernate_preallocate_memory(void) ktime_t start, stop; int error; - pr_info("Preallocating image memory... "); + pr_info("Preallocating image memory\n"); start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate original bitmap\n"); goto err_out; + } error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate copy bitmap\n"); goto err_out; + } alloc_normal = 0; alloc_highmem = 0; @@ -1802,8 +1806,11 @@ int hibernate_preallocate_memory(void) alloc -= pages; pages += pages_highmem; pages_highmem = preallocate_image_highmem(alloc); - if (pages_highmem < alloc) + if (pages_highmem < alloc) { + pr_err("Image allocation is %lu pages short\n", + alloc - pages_highmem); goto err_out; + } pages += pages_highmem; /* * size is the desired number of saveable pages to leave in @@ -1834,13 +1841,12 @@ int hibernate_preallocate_memory(void) out: stop = ktime_get(); - pr_cont("done (allocated %lu pages)\n", pages); + pr_info("Allocated %lu pages for shapshot\n", pages); swsusp_show_speed(start, stop, pages, "Allocated"); return 0; err_out: - pr_cont("\n"); swsusp_free(); return -ENOMEM; } @@ -1974,7 +1980,7 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating hibernation image:\n"); + pr_info("Creating image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); @@ -2008,7 +2014,7 @@ asmlinkage __visible int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Hibernation image created (%d pages copied)\n", nr_pages); + pr_info("Image created (%d pages copied)\n", nr_pages); return 0; } -- cgit v1.2.3 From dd499f7a7e34270208350a849ef103c0b3ae477f Mon Sep 17 00:00:00 2001 From: Amanieu d'Antras Date: Thu, 2 Jan 2020 18:24:13 +0100 Subject: clone3: ensure copy_thread_tls is implemented copy_thread implementations handle CLONE_SETTLS by reading the TLS value from the registers containing the syscall arguments for clone. This doesn't work with clone3 since the TLS value is passed in clone_args instead. Signed-off-by: Amanieu d'Antras Cc: # 5.3.x Link: https://lore.kernel.org/r/20200102172413.654385-8-amanieu@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..080809560072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2578,6 +2578,16 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #endif #ifdef __ARCH_WANT_SYS_CLONE3 + +/* + * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from + * the registers containing the syscall arguments for clone. This doesn't work + * with clone3 since the TLS value is passed in clone_args instead. + */ +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +#error clone3 requires copy_thread_tls support in arch +#endif + noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) -- cgit v1.2.3 From f6d061d617124abbd55396a3bc37b9bf7d33233c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Dec 2019 19:54:55 +0800 Subject: kernel/module: Fix memleak in module_add_modinfo_attrs() In module_add_modinfo_attrs() if sysfs_create_file() fails on the first iteration of the loop (so i = 0), we forget to free the modinfo_attrs. Fixes: bc6f2a757d52 ("kernel/module: Fix mem leak in module_add_modinfo_attrs") Reviewed-by: Miroslav Benes Signed-off-by: YueHaibing Signed-off-by: Jessica Yu --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 155349275b8e..d4d876172e9d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1784,6 +1784,8 @@ static int module_add_modinfo_attrs(struct module *mod) error_out: if (i > 0) module_remove_modinfo_attrs(mod, --i); + else + kfree(mod->modinfo_attrs); return error; } -- cgit v1.2.3 From d42cc530b18db2dd9de621238d33670841aabc36 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 4 Dec 2019 10:59:14 -0500 Subject: kexec: quiet down kexec reboot Here is a regular kexec command sequence and output: ===== $ kexec --reuse-cmdline -i --load Image $ kexec -e [ 161.342002] kexec_core: Starting new kernel Welcome to Buildroot buildroot login: ===== Even when "quiet" kernel parameter is specified, "kexec_core: Starting new kernel" is printed. This message has KERN_EMERG level, but there is no emergency, it is a normal kexec operation, so quiet it down to appropriate KERN_NOTICE. Machines that have slow console baud rate benefit from less output. Signed-off-by: Pavel Tatashin Reviewed-by: Simon Horman Acked-by: Dave Young Signed-off-by: Will Deacon --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 15d70a90b50d..f7ae04b8de6f 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1171,7 +1171,7 @@ int kernel_kexec(void) * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); + pr_notice("Starting new kernel\n"); machine_shutdown(); } -- cgit v1.2.3 From de68e4daea9084df4c614d31e2061d5d31bf24f4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 4 Dec 2019 10:59:15 -0500 Subject: kexec: add machine_kexec_post_load() It is the same as machine_kexec_prepare(), but is called after segments are loaded. This way, can do processing work with already loaded relocation segments. One such example is arm64: it has to have segments loaded in order to create a page table, but it cannot do it during kexec time, because at that time allocations won't be possible anymore. Signed-off-by: Pavel Tatashin Acked-by: Dave Young Signed-off-by: Will Deacon --- kernel/kexec.c | 4 ++++ kernel/kexec_core.c | 6 ++++++ kernel/kexec_file.c | 4 ++++ kernel/kexec_internal.h | 2 ++ 4 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index bc933c0db9bf..f977786fe498 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -159,6 +159,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, kimage_terminate(image); + ret = machine_kexec_post_load(image); + if (ret) + goto out; + /* Install the new kernel and uninstall the old */ image = xchg(dest_image, image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f7ae04b8de6f..c19c0dad1ebe 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -589,6 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unusable_pages); } + +int __weak machine_kexec_post_load(struct kimage *image) +{ + return 0; +} + void kimage_terminate(struct kimage *image) { if (*image->entry != 0) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index a2df93948665..faa74d5f6941 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -441,6 +441,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, kimage_terminate(image); + ret = machine_kexec_post_load(image); + if (ret) + goto out; + /* * Free up any temporary buffers allocated which are not needed * after image has been loaded diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d..39d30ccf8d87 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,6 +13,8 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); +int machine_kexec_post_load(struct kimage *image); + extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE -- cgit v1.2.3 From e4add247789e4ba5e08ad8256183ce2e211877d4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 Jan 2020 23:42:24 +0900 Subject: kprobes: Fix optimize_kprobe()/unoptimize_kprobe() cancellation logic optimize_kprobe() and unoptimize_kprobe() cancels if a given kprobe is on the optimizing_list or unoptimizing_list already. However, since the following commit: f66c0447cca1 ("kprobes: Set unoptimized flag after unoptimizing code") modified the update timing of the KPROBE_FLAG_OPTIMIZED, it doesn't work as expected anymore. The optimized_kprobe could be in the following states: - [optimizing]: Before inserting jump instruction op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is not empty. - [optimized]: jump inserted op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is empty. - [unoptimizing]: Before removing jump instruction (including unused optprobe) op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is not empty. - [unoptimized]: jump removed op.kp->flags doesn't have KPROBE_FLAG_OPTIMIZED and op->list is empty. Current code mis-expects [unoptimizing] state doesn't have KPROBE_FLAG_OPTIMIZED, and that can cause incorrect results. To fix this, introduce optprobe_queued_unopt() to distinguish [optimizing] and [unoptimizing] states and fixes the logic in optimize_kprobe() and unoptimize_kprobe(). [ mingo: Cleaned up the changelog and the code a bit. ] Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt (VMware) Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Fixes: f66c0447cca1 ("kprobes: Set unoptimized flag after unoptimizing code") Link: https://lkml.kernel.org/r/157840814418.7181.13478003006386303481.stgit@devnote2 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 67 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 34e28b236d68..2625c241ac00 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -612,6 +612,18 @@ void wait_for_kprobe_optimizer(void) mutex_unlock(&kprobe_mutex); } +static bool optprobe_queued_unopt(struct optimized_kprobe *op) +{ + struct optimized_kprobe *_op; + + list_for_each_entry(_op, &unoptimizing_list, list) { + if (op == _op) + return true; + } + + return false; +} + /* Optimize kprobe if p is ready to be optimized */ static void optimize_kprobe(struct kprobe *p) { @@ -633,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p) return; /* Check if it is already optimized. */ - if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) { + if (optprobe_queued_unopt(op)) { + /* This is under unoptimizing. Just dequeue the probe */ + list_del_init(&op->list); + } return; + } op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - if (!list_empty(&op->list)) - /* This is under unoptimizing. Just dequeue the probe */ - list_del_init(&op->list); - else { - list_add(&op->list, &optimizing_list); - kick_kprobe_optimizer(); - } + /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */ + if (WARN_ON_ONCE(!list_empty(&op->list))) + return; + + list_add(&op->list, &optimizing_list); + kick_kprobe_optimizer(); } /* Short cut to direct unoptimizing */ @@ -665,30 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) return; /* This is not an optprobe nor optimized */ op = container_of(p, struct optimized_kprobe, kp); - if (!kprobe_optimized(p)) { - /* Unoptimized or unoptimizing case */ - if (force && !list_empty(&op->list)) { - /* - * Only if this is unoptimizing kprobe and forced, - * forcibly unoptimize it. (No need to unoptimize - * unoptimized kprobe again :) - */ - list_del_init(&op->list); - force_unoptimize_kprobe(op); - } + if (!kprobe_optimized(p)) return; - } if (!list_empty(&op->list)) { - /* Dequeue from the optimization queue */ - list_del_init(&op->list); + if (optprobe_queued_unopt(op)) { + /* Queued in unoptimizing queue */ + if (force) { + /* + * Forcibly unoptimize the kprobe here, and queue it + * in the freeing list for release afterwards. + */ + force_unoptimize_kprobe(op); + list_move(&op->list, &freeing_list); + } + } else { + /* Dequeue from the optimizing queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } return; } + /* Optimized kprobe case */ - if (force) + if (force) { /* Forcibly update the code: this is a special case */ force_unoptimize_kprobe(op); - else { + } else { list_add(&op->list, &unoptimizing_list); kick_kprobe_optimizer(); } -- cgit v1.2.3 From 51bfb1d11d6daf095addf9fe8471c20992caae0b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 8 Dec 2019 20:26:55 -0800 Subject: futex: Fix kernel-doc notation warning Fix a kernel-doc warning in kernel/futex.c by adding notation for @ret. ../kernel/futex.c:1187: warning: Function parameter or member 'ret' not described in 'wait_for_owner_exiting' Fixes: 3ef240eaff36 ("futex: Prevent exit livelock") Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/223be78c-f3c8-52df-836d-c5fb8e7907e9@infradead.org --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 03c518e9747e..0cf84c8664f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1178,6 +1178,7 @@ out_error: /** * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. -- cgit v1.2.3 From 025af39b87dc4dc78de4e861ca8b88a1d5ba89f6 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 5 Nov 2019 15:08:54 +0100 Subject: genirq: Show irq name in non-oneshot error message Requesting a threaded IRQ with handler=NULL and !ONESHOT fails, but the error message does not include the IRQ line name, which makes it harder to find the offending driver. Print the IRQ line name to clarify where the error comes from. Use the same format as the other pr_err() above in the same function. Signed-off-by: Luca Ceresoli Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191105140854.27893-1-luca@lucaceresoli.net --- kernel/irq/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1753486b440c..b6c53ab053d2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1500,8 +1500,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. */ - pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", - irq); + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", + new->name, irq); ret = -EINVAL; goto out_unlock; } -- cgit v1.2.3 From dc8d37ed304eeeea47e65fb9edc1c6c8b0093386 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Dec 2019 20:56:04 +0100 Subject: cpu/SMT: Fix x86 link error without CONFIG_SYSFS When CONFIG_SYSFS is disabled, but CONFIG_HOTPLUG_SMT is enabled, the kernel fails to link: arch/x86/power/cpu.o: In function `hibernate_resume_nonboot_cpu_disable': (.text+0x38d): undefined reference to `cpuhp_smt_enable' arch/x86/power/hibernate.o: In function `arch_resume_nosmt': hibernate.c:(.text+0x291): undefined reference to `cpuhp_smt_enable' hibernate.c:(.text+0x29c): undefined reference to `cpuhp_smt_disable' Move the exported functions out of the #ifdef section into its own with the correct conditions. The patch that caused this is marked for stable backports, so this one may need to be backported as well. Fixes: ec527c318036 ("x86/power: Fix 'nosmt' vs hibernation triple fault during resume") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Jiri Kosina Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191210195614.786555-1-arnd@arndb.de --- kernel/cpu.c | 143 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 72 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..4dc279ed3b2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1909,6 +1909,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } EXPORT_SYMBOL(__cpuhp_remove_state); +#ifdef CONFIG_HOTPLUG_SMT +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +int cpuhp_smt_enable(void) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } + cpu_maps_update_done(); + return ret; +} +#endif + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) @@ -2063,77 +2135,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static void cpuhp_offline_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = true; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); -} - -static void cpuhp_online_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = false; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_ONLINE); -} - -int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - for_each_online_cpu(cpu) { - if (topology_is_primary_thread(cpu)) - continue; - ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); - if (ret) - break; - /* - * As this needs to hold the cpu maps lock it's impossible - * to call device_offline() because that ends up calling - * cpu_down() which takes cpu maps lock. cpu maps lock - * needs to be held as this might race against in kernel - * abusers of the hotplug machinery (thermal management). - * - * So nothing would update device:offline state. That would - * leave the sysfs entry stale and prevent onlining after - * smt control has been changed to 'off' again. This is - * called under the sysfs hotplug lock, so it is properly - * serialized against the regular offline usage. - */ - cpuhp_offline_cpu_device(cpu); - } - if (!ret) - cpu_smt_control = ctrlval; - cpu_maps_update_done(); - return ret; -} - -int cpuhp_smt_enable(void) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - cpu_smt_control = CPU_SMT_ENABLED; - for_each_present_cpu(cpu) { - /* Skip online CPUs and CPUs on offline nodes */ - if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) - continue; - ret = _cpu_up(cpu, 0, CPUHP_ONLINE); - if (ret) - break; - /* See comment in cpuhp_smt_disable() */ - cpuhp_online_cpu_device(cpu); - } - cpu_maps_update_done(); - return ret; -} - - static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -- cgit v1.2.3 From 65726b5b7efae718391752ae8bb85b9843fd83f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:54 -0800 Subject: bpf: Save PTR_TO_BTF_ID register state when spilling to stack This patch makes the verifier save the PTR_TO_BTF_ID register state when spilling to the stack. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003454.3854870-1-kafai@fb.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f63ae7a370c..d433d70022fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1916,6 +1916,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: return true; default: return false; -- cgit v1.2.3 From 275517ff452a535da5eef25b1c22e53fc50b0a12 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:56 -0800 Subject: bpf: Avoid storing modifier to info->btf_id info->btf_id expects the btf_id of a struct, so it should store the final result after skipping modifiers (if any). It also takes this chanace to add a missing newline in one of the bpf_log() messages. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003456.3855176-1-kafai@fb.com --- kernel/bpf/btf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed2075884724..497ecf62d79d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3697,7 +3697,6 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; - info->btf_id = t->type; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); @@ -3708,10 +3707,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } } + + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ - while (btf_type_is_modifier(t)) + while (btf_type_is_modifier(t)) { + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3737,7 +3740,7 @@ int btf_struct_access(struct bpf_verifier_log *log, again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); if (!btf_type_is_struct(t)) { - bpf_log(log, "Type '%s' is not a struct", tname); + bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } -- cgit v1.2.3 From 218b3f65f9081f5e1bffe6de5f0f4b22c935410b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:59 -0800 Subject: bpf: Add enum support to btf_ctx_access() It allows bpf prog (e.g. tracing) to attach to a kernel function that takes enum argument. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003459.3855366-1-kafai@fb.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 497ecf62d79d..6a5ccb748a72 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3677,7 +3677,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t)) + if (btf_type_is_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit v1.2.3 From 976aba002fcb4b1baa71e4b0854f3d4ae48c1d4d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:01 -0800 Subject: bpf: Support bitfield read access in btf_struct_access This patch allows bitfield access as a scalar. It checks "off + size > t->size" to avoid accessing bitfield end up accessing beyond the struct. This check is done outside of the loop since it is applicable to all access. It also takes this chance to break early on the "off < moff" case. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003501.3855427-1-kafai@fb.com --- kernel/bpf/btf.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6a5ccb748a72..48bbde2e1c1e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3744,19 +3744,53 @@ again: return -EINVAL; } - for_each_member(i, t, member) { - if (btf_member_bitfield_size(t, member)) - /* bitfields are not supported yet */ - continue; + if (off + size > t->size) { + bpf_log(log, "access beyond struct %s at off %u size %u\n", + tname, off, size); + return -EACCES; + } + for_each_member(i, t, member) { /* offset of the field in bytes */ moff = btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; + + if (btf_member_bitfield_size(t, member)) { + u32 end_bit = btf_member_bit_offset(t, member) + + btf_member_bitfield_size(t, member); + + /* off <= moff instead of off == moff because clang + * does not generate a BTF member for anonymous + * bitfield like the ":16" here: + * struct { + * int :16; + * int x:8; + * }; + */ + if (off <= moff && + BITS_ROUNDUP_BYTES(end_bit) <= off + size) + return SCALAR_VALUE; + + /* off may be accessing a following member + * + * or + * + * Doing partial access at either end of this + * bitfield. Continue on this case also to + * treat it as not accessing this bitfield + * and eventually error out as field not + * found to keep it simple. + * It could be relaxed if there was a legit + * partial access case later. + */ + continue; + } + /* In case of "off" is pointing to holes of a struct */ if (off < moff) - continue; + break; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); -- cgit v1.2.3 From 27ae7997a66174cb8afd6a75b3989f5e0c1b9e5a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:03 -0800 Subject: bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS This patch allows the kernel's struct ops (i.e. func ptr) to be implemented in BPF. The first use case in this series is the "struct tcp_congestion_ops" which will be introduced in a latter patch. This patch introduces a new prog type BPF_PROG_TYPE_STRUCT_OPS. The BPF_PROG_TYPE_STRUCT_OPS prog is verified against a particular func ptr of a kernel struct. The attr->attach_btf_id is the btf id of a kernel struct. The attr->expected_attach_type is the member "index" of that kernel struct. The first member of a struct starts with member index 0. That will avoid ambiguity when a kernel struct has multiple func ptrs with the same func signature. For example, a BPF_PROG_TYPE_STRUCT_OPS prog is written to implement the "init" func ptr of the "struct tcp_congestion_ops". The attr->attach_btf_id is the btf id of the "struct tcp_congestion_ops" of the _running_ kernel. The attr->expected_attach_type is 3. The ctx of BPF_PROG_TYPE_STRUCT_OPS is an array of u64 args saved by arch_prepare_bpf_trampoline that will be done in the next patch when introducing BPF_MAP_TYPE_STRUCT_OPS. "struct bpf_struct_ops" is introduced as a common interface for the kernel struct that supports BPF_PROG_TYPE_STRUCT_OPS prog. The supporting kernel struct will need to implement an instance of the "struct bpf_struct_ops". The supporting kernel struct also needs to implement a bpf_verifier_ops. During BPF_PROG_LOAD, bpf_struct_ops_find() will find the right bpf_verifier_ops by searching the attr->attach_btf_id. A new "btf_struct_access" is also added to the bpf_verifier_ops such that the supporting kernel struct can optionally provide its own specific check on accessing the func arg (e.g. provide limited write access). After btf_vmlinux is parsed, the new bpf_struct_ops_init() is called to initialize some values (e.g. the btf id of the supporting kernel struct) and it can only be done once the btf_vmlinux is available. The R0 checks at BPF_EXIT is excluded for the BPF_PROG_TYPE_STRUCT_OPS prog if the return type of the prog->aux->attach_func_proto is "void". Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003503.3855825-1-kafai@fb.com --- kernel/bpf/Makefile | 3 + kernel/bpf/bpf_struct_ops.c | 121 ++++++++++++++++++++++++++++++++++ kernel/bpf/bpf_struct_ops_types.h | 4 ++ kernel/bpf/btf.c | 88 +++++++++++++++++-------- kernel/bpf/syscall.c | 17 +++-- kernel/bpf/verifier.c | 134 +++++++++++++++++++++++++++++--------- 6 files changed, 304 insertions(+), 63 deletions(-) create mode 100644 kernel/bpf/bpf_struct_ops.c create mode 100644 kernel/bpf/bpf_struct_ops_types.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d4f330351f87..046ce5d98033 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -27,3 +27,6 @@ endif ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +endif diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c new file mode 100644 index 000000000000..2ea68fe34c33 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPF_STRUCT_OPS_TYPE(_name) \ +extern struct bpf_struct_ops bpf_##_name; +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + +enum { +#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + __NR_BPF_STRUCT_OPS_TYPE, +}; + +static struct bpf_struct_ops * const bpf_struct_ops[] = { +#define BPF_STRUCT_OPS_TYPE(_name) \ + [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE +}; + +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { +}; + +const struct bpf_prog_ops bpf_struct_ops_prog_ops = { +}; + +void bpf_struct_ops_init(struct btf *btf) +{ + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_verifier_log log = {}; + const struct btf_type *t; + const char *mname; + s32 type_id; + u32 i, j; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + st_ops = bpf_struct_ops[i]; + + type_id = btf_find_by_name_kind(btf, st_ops->name, + BTF_KIND_STRUCT); + if (type_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + st_ops->name); + continue; + } + t = btf_type_by_id(btf, type_id); + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { + pr_warn("Cannot support #%u members in struct %s\n", + btf_type_vlen(t), st_ops->name); + continue; + } + + for_each_member(j, t, member) { + const struct btf_type *func_proto; + + mname = btf_name_by_offset(btf, member->name_off); + if (!*mname) { + pr_warn("anon member in struct %s is not supported\n", + st_ops->name); + break; + } + + if (btf_member_bitfield_size(t, member)) { + pr_warn("bit field member %s in struct %s is not supported\n", + mname, st_ops->name); + break; + } + + func_proto = btf_type_resolve_func_ptr(btf, + member->type, + NULL); + if (func_proto && + btf_distill_func_proto(&log, btf, + func_proto, mname, + &st_ops->func_models[j])) { + pr_warn("Error in parsing func ptr %s in struct %s\n", + mname, st_ops->name); + break; + } + } + + if (j == btf_type_vlen(t)) { + if (st_ops->init(btf)) { + pr_warn("Error in init bpf_struct_ops %s\n", + st_ops->name); + } else { + st_ops->type_id = type_id; + st_ops->type = t; + } + } + } +} + +extern struct btf *btf_vmlinux; + +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + unsigned int i; + + if (!type_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->type_id == type_id) + return bpf_struct_ops[i]; + } + + return NULL; +} diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h new file mode 100644 index 000000000000..7bb13ff49ec2 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* internal file - do not include directly */ + +/* To be filled in a later patch */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 48bbde2e1c1e..12af4a1bb1a4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -180,11 +180,6 @@ */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -#define for_each_member(i, struct_type, member) \ - for (i = 0, member = btf_type_member(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -382,6 +377,65 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +{ + const struct btf_type *t; + const char *tname; + u32 i; + + for (i = 1; i <= btf->nr_types; i++) { + t = btf->types[i]; + if (BTF_INFO_KIND(t->info) != kind) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, name)) + return i; + } + + return -ENOENT; +} + +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t)) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + if (res_id) + *res_id = id; + + return t; +} + +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, id, NULL); + if (!btf_type_is_ptr(t)) + return NULL; + + return btf_type_skip_modifiers(btf, t->type, res_id); +} + +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *ptype; + + ptype = btf_type_resolve_ptr(btf, id, res_id); + if (ptype && btf_type_is_func_proto(ptype)) + return ptype; + + return NULL; +} + /* Types that act only as a source, not sink or intermediate * type when resolving. */ @@ -446,16 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u16 btf_type_vlen(const struct btf_type *t) -{ - return BTF_INFO_VLEN(t->info); -} - -static bool btf_type_kflag(const struct btf_type *t) -{ - return BTF_INFO_KFLAG(t->info); -} - static u32 btf_member_bit_offset(const struct btf_type *struct_type, const struct btf_member *member) { @@ -463,13 +507,6 @@ static u32 btf_member_bit_offset(const struct btf_type *struct_type, : member->offset; } -static u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) - : 0; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -480,11 +517,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t) return (const struct btf_array *)(t + 1); } -static const struct btf_member *btf_type_member(const struct btf_type *t) -{ - return (const struct btf_member *)(t + 1); -} - static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); @@ -3605,6 +3637,8 @@ struct btf *btf_parse_vmlinux(void) goto errout; } + bpf_struct_ops_init(btf); + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 81ee8595dfee..03a02ef4c496 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1672,17 +1672,22 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id, u32 prog_fd) { - switch (prog_type) { - case BPF_PROG_TYPE_TRACING: + if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; - break; - default: - if (btf_id || prog_fd) + + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + break; + default: return -EINVAL; - break; + } } + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d433d70022fd..586ed3c94b80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2859,11 +2859,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; - if (atype != BPF_READ) { - verbose(env, "only read is supported\n"); - return -EACCES; - } - if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -2880,17 +2875,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (env->ops->btf_struct_access) { + ret = env->ops->btf_struct_access(&env->log, t, off, size, + atype, &btf_id); + } else { + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, + &btf_id); + } + if (ret < 0) return ret; - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; + if (atype == BPF_READ) { + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + return 0; } @@ -6349,8 +6359,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + int err; + + /* The struct_ops func-ptr's return type could be "void" */ + if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + !prog->aux->attach_func_proto->type) + return 0; + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R0 leaks addr as return value\n"); + return -EACCES; + } switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -8016,21 +8048,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - /* eBPF calling convetion is such that R0 is used - * to return the value from eBPF program. - * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); - return -EACCES; - } - err = check_return_code(env); if (err) return err; @@ -8829,12 +8846,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: - if (type == BPF_WRITE) { + if (type == BPF_READ) { + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; + } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } - insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; continue; default: continue; @@ -9502,6 +9521,58 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_struct_ops_btf_id(struct bpf_verifier_env *env) +{ + const struct btf_type *t, *func_proto; + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + struct bpf_prog *prog = env->prog; + u32 btf_id, member_idx; + const char *mname; + + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) { + verbose(env, "attach_btf_id %u is not a supported struct\n", + btf_id); + return -ENOTSUPP; + } + + t = st_ops->type; + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) { + verbose(env, "attach to invalid member idx %u of struct %s\n", + member_idx, st_ops->name); + return -EINVAL; + } + + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type, + NULL); + if (!func_proto) { + verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", + mname, member_idx, st_ops->name); + return -EINVAL; + } + + if (st_ops->check_member) { + int err = st_ops->check_member(t, member); + + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + } + + prog->aux->attach_func_proto = func_proto; + prog->aux->attach_func_name = mname; + env->ops = st_ops->verifier_ops; + + return 0; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -9517,6 +9588,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + if (prog->type != BPF_PROG_TYPE_TRACING) return 0; -- cgit v1.2.3 From 85d33df357b634649ddbe0a20fd2d0fc5732c3cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:05 -0800 Subject: bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value is a kernel struct with its func ptr implemented in bpf prog. This new map is the interface to register/unregister/introspect a bpf implemented kernel struct. The kernel struct is actually embedded inside another new struct (or called the "value" struct in the code). For example, "struct tcp_congestion_ops" is embbeded in: struct bpf_struct_ops_tcp_congestion_ops { refcount_t refcnt; enum bpf_struct_ops_state state; struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */ } The map value is "struct bpf_struct_ops_tcp_congestion_ops". The "bpftool map dump" will then be able to show the state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g. number of tcp_sock in the tcp_congestion_ops case). This "value" struct is created automatically by a macro. Having a separate "value" struct will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding "void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some initialization works before registering the struct_ops to the kernel subsystem). The libbpf will take care of finding and populating the "struct bpf_struct_ops_XYZ" from "struct XYZ". Register a struct_ops to a kernel subsystem: 1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s) 2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the running kernel. Instead of reusing the attr->btf_value_type_id, btf_vmlinux_value_type_id s added such that attr->btf_fd can still be used as the "user" btf which could store other useful sysadmin/debug info that may be introduced in the furture, e.g. creation-date/compiler-details/map-creator...etc. 3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described in the running kernel btf. Populate the value of this object. The function ptr should be populated with the prog fds. 4. Call BPF_MAP_UPDATE with the object created in (3) as the map value. The key is always "0". During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's args as an array of u64 is generated. BPF_MAP_UPDATE also allows the specific struct_ops to do some final checks in "st_ops->init_member()" (e.g. ensure all mandatory func ptrs are implemented). If everything looks good, it will register this kernel struct to the kernel subsystem. The map will not allow further update from this point. Unregister a struct_ops from the kernel subsystem: BPF_MAP_DELETE with key "0". Introspect a struct_ops: BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will have the prog _id_ populated as the func ptr. The map value state (enum bpf_struct_ops_state) will transit from: INIT (map created) => INUSE (map updated, i.e. reg) => TOBEFREE (map value deleted, i.e. unreg) The kernel subsystem needs to call bpf_struct_ops_get() and bpf_struct_ops_put() to manage the "refcnt" in the "struct bpf_struct_ops_XYZ". This patch uses a separate refcnt for the purose of tracking the subsystem usage. Another approach is to reuse the map->refcnt and then "show" (i.e. during map_lookup) the subsystem's usage by doing map->refcnt - map->usercnt to filter out the map-fd/pinned-map usage. However, that will also tie down the future semantics of map->refcnt and map->usercnt. The very first subsystem's refcnt (during reg()) holds one count to map->refcnt. When the very last subsystem's refcnt is gone, it will also release the map->refcnt. All bpf_prog will be freed when the map->refcnt reaches 0 (i.e. during map_free()). Here is how the bpftool map command will look like: [root@arch-fb-vm1 bpf]# bpftool map show 6: struct_ops name dctcp flags 0x0 key 4B value 256B max_entries 1 memlock 4096B btf_id 6 [root@arch-fb-vm1 bpf]# bpftool map dump id 6 [{ "value": { "refcnt": { "refs": { "counter": 1 } }, "state": 1, "data": { "list": { "next": 0, "prev": 0 }, "key": 0, "flags": 2, "init": 24, "release": 0, "ssthresh": 25, "cong_avoid": 30, "set_state": 27, "cwnd_event": 28, "in_ack_event": 26, "undo_cwnd": 29, "pkts_acked": 0, "min_tso_segs": 0, "sndbuf_expand": 0, "cong_control": 0, "get_info": 0, "name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0 ], "owner": 0 } } } ] Misc Notes: * bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup. It does an inplace update on "*value" instead returning a pointer to syscall.c. Otherwise, it needs a separate copy of "zero" value for the BPF_STRUCT_OPS_STATE_INIT to avoid races. * The bpf_struct_ops_map_delete_elem() is also called without preempt_disable() from map_delete_elem(). It is because the "->unreg()" may requires sleepable context, e.g. the "tcp_unregister_congestion_control()". * "const" is added to some of the existing "struct btf_func_model *" function arg to avoid a compiler warning caused by this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 511 +++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/btf.c | 20 +- kernel/bpf/map_in_map.c | 3 +- kernel/bpf/syscall.c | 52 +++-- kernel/bpf/trampoline.c | 8 +- kernel/bpf/verifier.c | 5 + 6 files changed, 565 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 2ea68fe34c33..ddf48f49914b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -9,9 +9,68 @@ #include #include #include +#include +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, +}; + +#define BPF_STRUCT_OPS_COMMON_VALUE \ + refcount_t refcnt; \ + enum bpf_struct_ops_state state + +struct bpf_struct_ops_value { + BPF_STRUCT_OPS_COMMON_VALUE; + char data[0] ____cacheline_aligned_in_smp; +}; + +struct bpf_struct_ops_map { + struct bpf_map map; + const struct bpf_struct_ops *st_ops; + /* protect map_update */ + struct mutex lock; + /* progs has all the bpf_prog that is populated + * to the func ptr of the kernel's struct + * (in kvalue.data). + */ + struct bpf_prog **progs; + /* image is a page that has all the trampolines + * that stores the func args before calling the bpf_prog. + * A PAGE_SIZE "image" is enough to store all trampoline for + * "progs[]". + */ + void *image; + /* uvalue->data stores the kernel struct + * (e.g. tcp_congestion_ops) that is more useful + * to userspace than the kvalue. For example, + * the bpf_prog's id is stored instead of the kernel + * address of a func ptr. + */ + struct bpf_struct_ops_value *uvalue; + /* kvalue.data stores the actual kernel's struct + * (e.g. tcp_congestion_ops) that will be + * registered to the kernel subsystem. + */ + struct bpf_struct_ops_value kvalue; +}; + +#define VALUE_PREFIX "bpf_struct_ops_" +#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) + +/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is + * the map's value exposed to the userspace and its btf-type-id is + * stored at the map->btf_vmlinux_value_type_id. + * + */ #define BPF_STRUCT_OPS_TYPE(_name) \ -extern struct bpf_struct_ops bpf_##_name; +extern struct bpf_struct_ops bpf_##_name; \ + \ +struct bpf_struct_ops_##_name { \ + BPF_STRUCT_OPS_COMMON_VALUE; \ + struct _name data ____cacheline_aligned_in_smp; \ +}; #include "bpf_struct_ops_types.h" #undef BPF_STRUCT_OPS_TYPE @@ -35,19 +94,50 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { const struct bpf_prog_ops bpf_struct_ops_prog_ops = { }; +static const struct btf_type *module_type; + void bpf_struct_ops_init(struct btf *btf) { + s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_verifier_log log = {}; const struct btf_type *t; + char value_name[128]; const char *mname; - s32 type_id; u32 i, j; + /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ +#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + + module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); + if (module_id < 0) { + pr_warn("Cannot find struct module in btf_vmlinux\n"); + return; + } + module_type = btf_type_by_id(btf, module_id); + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { st_ops = bpf_struct_ops[i]; + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + continue; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + value_name); + continue; + } + type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { @@ -98,6 +188,9 @@ void bpf_struct_ops_init(struct btf *btf) } else { st_ops->type_id = type_id; st_ops->type = t; + st_ops->value_id = value_id; + st_ops->value_type = btf_type_by_id(btf, + value_id); } } } @@ -105,6 +198,22 @@ void bpf_struct_ops_init(struct btf *btf) extern struct btf *btf_vmlinux; +static const struct bpf_struct_ops * +bpf_struct_ops_find_value(u32 value_id) +{ + unsigned int i; + + if (!value_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->value_id == value_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { unsigned int i; @@ -119,3 +228,401 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) return NULL; } + +static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + if (key && *(u32 *)key == 0) + return -ENOENT; + + *(u32 *)next_key = 0; + return 0; +} + +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + struct bpf_struct_ops_value *uvalue, *kvalue; + enum bpf_struct_ops_state state; + + if (unlikely(*(u32 *)key != 0)) + return -ENOENT; + + kvalue = &st_map->kvalue; + /* Pair with smp_store_release() during map_update */ + state = smp_load_acquire(&kvalue->state); + if (state == BPF_STRUCT_OPS_STATE_INIT) { + memset(value, 0, map->value_size); + return 0; + } + + /* No lock is needed. state and refcnt do not need + * to be updated together under atomic context. + */ + uvalue = (struct bpf_struct_ops_value *)value; + memcpy(uvalue, st_map->uvalue, map->value_size); + uvalue->state = state; + refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + return 0; +} + +static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) +{ + const struct btf_type *t = st_map->st_ops->type; + u32 i; + + for (i = 0; i < btf_type_vlen(t); i++) { + if (st_map->progs[i]) { + bpf_prog_put(st_map->progs[i]); + st_map->progs[i] = NULL; + } + } +} + +static int check_zero_holes(const struct btf_type *t, void *data) +{ + const struct btf_member *member; + u32 i, moff, msize, prev_mend = 0; + const struct btf_type *mtype; + + for_each_member(i, t, member) { + moff = btf_member_bit_offset(t, member) / 8; + if (moff > prev_mend && + memchr_inv(data + prev_mend, 0, moff - prev_mend)) + return -EINVAL; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + prev_mend = moff + msize; + } + + if (t->size > prev_mend && + memchr_inv(data + prev_mend, 0, t->size - prev_mend)) + return -EINVAL; + + return 0; +} + +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_member *member; + const struct btf_type *t = st_ops->type; + void *udata, *kdata; + int prog_fd, err = 0; + void *image; + u32 i; + + if (flags) + return -EINVAL; + + if (*(u32 *)key != 0) + return -E2BIG; + + err = check_zero_holes(st_ops->value_type, value); + if (err) + return err; + + uvalue = (struct bpf_struct_ops_value *)value; + err = check_zero_holes(t, uvalue->data); + if (err) + return err; + + if (uvalue->state || refcount_read(&uvalue->refcnt)) + return -EINVAL; + + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + mutex_lock(&st_map->lock); + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + err = -EBUSY; + goto unlock; + } + + memcpy(uvalue, value, map->value_size); + + udata = &uvalue->data; + kdata = &kvalue->data; + image = st_map->image; + + for_each_member(i, t, member) { + const struct btf_type *mtype, *ptype; + struct bpf_prog *prog; + u32 moff; + + moff = btf_member_bit_offset(t, member) / 8; + ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + if (ptype == module_type) { + if (*(void **)(udata + moff)) + goto reset_unlock; + *(void **)(kdata + moff) = BPF_MODULE_OWNER; + continue; + } + + err = st_ops->init_member(t, member, kdata, udata); + if (err < 0) + goto reset_unlock; + + /* The ->init_member() has handled this member */ + if (err > 0) + continue; + + /* If st_ops->init_member does not handle it, + * we will only handle func ptrs and zero-ed members + * here. Reject everything else. + */ + + /* All non func ptr member must be 0 */ + if (!ptype || !btf_type_is_func_proto(ptype)) { + u32 msize; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) { + err = PTR_ERR(mtype); + goto reset_unlock; + } + + if (memchr_inv(udata + moff, 0, msize)) { + err = -EINVAL; + goto reset_unlock; + } + + continue; + } + + prog_fd = (int)(*(unsigned long *)(udata + moff)); + /* Similar check as the attr->attach_prog_fd */ + if (!prog_fd) + continue; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto reset_unlock; + } + st_map->progs[i] = prog; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->aux->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != i) { + err = -EINVAL; + goto reset_unlock; + } + + err = arch_prepare_bpf_trampoline(image, + st_map->image + PAGE_SIZE, + &st_ops->func_models[i], 0, + &prog, 1, NULL, 0, NULL); + if (err < 0) + goto reset_unlock; + + *(void **)(kdata + moff) = image; + image += err; + + /* put prog_id to udata */ + *(unsigned long *)(udata + moff) = prog->aux->id; + } + + refcount_set(&kvalue->refcnt, 1); + bpf_map_inc(map); + + set_memory_ro((long)st_map->image, 1); + set_memory_x((long)st_map->image, 1); + err = st_ops->reg(kdata); + if (likely(!err)) { + /* Pair with smp_load_acquire() during lookup_elem(). + * It ensures the above udata updates (e.g. prog->aux->id) + * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + + /* Error during st_ops->reg(). It is very unlikely since + * the above init_member() should have caught it earlier + * before reg(). The only possibility is if there was a race + * in registering the struct_ops (under the same name) to + * a sub-system through different struct_ops's maps. + */ + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(map); + +reset_unlock: + bpf_struct_ops_map_put_progs(st_map); + memset(uvalue, 0, map->value_size); + memset(kvalue, 0, map->value_size); +unlock: + mutex_unlock(&st_map->lock); + return err; +} + +static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +{ + enum bpf_struct_ops_state prev_state; + struct bpf_struct_ops_map *st_map; + + st_map = (struct bpf_struct_ops_map *)map; + prev_state = cmpxchg(&st_map->kvalue.state, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE); + if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + st_map->st_ops->unreg(&st_map->kvalue.data); + if (refcount_dec_and_test(&st_map->kvalue.refcnt)) + bpf_map_put(map); + } + + return 0; +} + +static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + value = bpf_struct_ops_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); +} + +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + if (st_map->progs) + bpf_struct_ops_map_put_progs(st_map); + bpf_map_area_free(st_map->progs); + bpf_jit_free_exec(st_map->image); + bpf_map_area_free(st_map->uvalue); + bpf_map_area_free(st_map); +} + +static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) +{ + if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || + attr->map_flags || !attr->btf_vmlinux_value_type_id) + return -EINVAL; + return 0; +} + +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) +{ + const struct bpf_struct_ops *st_ops; + size_t map_total_size, st_map_size; + struct bpf_struct_ops_map *st_map; + const struct btf_type *t, *vt; + struct bpf_map_memory mem; + struct bpf_map *map; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); + if (!st_ops) + return ERR_PTR(-ENOTSUPP); + + vt = st_ops->value_type; + if (attr->value_size != vt->size) + return ERR_PTR(-EINVAL); + + t = st_ops->type; + + st_map_size = sizeof(*st_map) + + /* kvalue stores the + * struct bpf_struct_ops_tcp_congestions_ops + */ + (vt->size - sizeof(struct bpf_struct_ops_value)); + map_total_size = st_map_size + + /* uvalue */ + sizeof(vt->size) + + /* struct bpf_progs **progs */ + btf_type_vlen(t) * sizeof(struct bpf_prog *); + err = bpf_map_charge_init(&mem, map_total_size); + if (err < 0) + return ERR_PTR(err); + + st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); + if (!st_map) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + st_map->st_ops = st_ops; + map = &st_map->map; + + st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->progs = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + NUMA_NO_NODE); + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->uvalue || !st_map->progs || !st_map->image) { + bpf_struct_ops_map_free(map); + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&st_map->lock); + set_vm_flush_reset_perms(st_map->image); + bpf_map_init_from_attr(map, attr); + bpf_map_charge_move(&map->memory, &mem); + + return map; +} + +const struct bpf_map_ops bpf_struct_ops_map_ops = { + .map_alloc_check = bpf_struct_ops_map_alloc_check, + .map_alloc = bpf_struct_ops_map_alloc, + .map_free = bpf_struct_ops_map_free, + .map_get_next_key = bpf_struct_ops_map_get_next_key, + .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_delete_elem = bpf_struct_ops_map_delete_elem, + .map_update_elem = bpf_struct_ops_map_update_elem, + .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +}; + +/* "const void *" because some subsystem is + * passing a const (e.g. const struct tcp_congestion_ops *) + */ +bool bpf_struct_ops_get(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + + return refcount_inc_not_zero(&kvalue->refcnt); +} + +void bpf_struct_ops_put(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + if (refcount_dec_and_test(&kvalue->refcnt)) { + struct bpf_struct_ops_map *st_map; + + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 12af4a1bb1a4..81d9cf75cacd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -500,13 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) - : member->offset; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -1089,7 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -static const struct btf_type * +const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *total_nelems) @@ -1143,8 +1136,10 @@ resolved: return ERR_PTR(-EINVAL); *type_size = nelems * size; - *total_nelems = nelems; - *elem_type = type; + if (total_nelems) + *total_nelems = nelems; + if (elem_type) + *elem_type = type; return array_type ? : type; } @@ -1858,7 +1853,10 @@ static void btf_modifier_seq_show(const struct btf *btf, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) { - t = btf_type_id_resolve(btf, &type_id); + if (btf->resolved_ids) + t = btf_type_id_resolve(btf, &type_id); + else + t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5e9366b33f0f..b3c48d1533cb 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 03a02ef4c496..f9db72a96ec0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr) goto out; } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } else if (IS_FD_PROG_ARRAY(map)) { @@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -2822,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 505f4e4b31d2..79a04417050d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,11 +160,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, tr->func.addr); - if (err) + if (err < 0) goto out; if (tr->selector) @@ -296,7 +297,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) } int __weak -arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 586ed3c94b80..f5af759a8a5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8155,6 +8155,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + verbose(env, "bpf_struct_ops map cannot be used in prog\n"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From 0baf26b0fcd74bbfcef53c5d5e8bad2b99c8d0d2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:08 -0800 Subject: bpf: tcp: Support tcp_congestion_ops in bpf This patch makes "struct tcp_congestion_ops" to be the first user of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops in bpf. The BPF implemented tcp_congestion_ops can be used like regular kernel tcp-cc through sysctl and setsockopt. e.g. [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic net.ipv4.tcp_congestion_control = bpf_cubic There has been attempt to move the TCP CC to the user space (e.g. CCP in TCP). The common arguments are faster turn around, get away from long-tail kernel versions in production...etc, which are legit points. BPF has been the continuous effort to join both kernel and userspace upsides together (e.g. XDP to gain the performance advantage without bypassing the kernel). The recent BPF advancements (in particular BTF-aware verifier, BPF trampoline, BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) possible in BPF. It allows a faster turnaround for testing algorithm in the production while leveraging the existing (and continue growing) BPF feature/framework instead of building one specifically for userspace TCP CC. This patch allows write access to a few fields in tcp-sock (in bpf_tcp_ca_btf_struct_access()). The optional "get_info" is unsupported now. It can be added later. One possible way is to output the info with a btf-id to describe the content. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 7bb13ff49ec2..066d83ea1c99 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* internal file - do not include directly */ -/* To be filled in a later patch */ +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif -- cgit v1.2.3 From 8b3b54799b99de59d25a3947d539662f47300ced Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 16 Dec 2019 14:42:07 +0000 Subject: genirq: Add missing __releases() sparse annotation Add __releases() annotation to address the following sparse warning: warning: context imbalance in __irq_put_desc_unlock() - unexpected unlock Signed-off-by: Jules Irenge Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191216144208.29852-1-jbi.octave@gmail.com --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5b8fdd659e54..98a5f10d1900 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -891,6 +891,7 @@ __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) + __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) -- cgit v1.2.3 From 099368bb10c0e340f0b236b169e8b13235e0907c Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 16 Dec 2019 14:42:08 +0000 Subject: genirq: Add missing __must_hold() sparse annotation Add __must_hold() annotation to address the following sparse warning: warning: context imbalance in irq_wait_for_poll - unexpected unlock Signed-off-by: Jules Irenge Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191216144208.29852-2-jbi.octave@gmail.com --- kernel/irq/spurious.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2ed97a7c9b2a..f865e5f4d382 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -34,6 +34,7 @@ static atomic_t irq_poll_active; * true and let the handler run. */ bool irq_wait_for_poll(struct irq_desc *desc) + __must_hold(&desc->lock) { if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), "irq poll in progress on cpu %d for irq %d\n", -- cgit v1.2.3 From f35deaff1b8eadb9897e4fb8b3edc7717f4ec6fa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 7 Dec 2019 20:10:26 +0100 Subject: time/posix-stubs: Provide compat itimer supoprt for alpha Using compat_sys_getitimer and compat_sys_setitimer on alpha causes a link failure in the Alpha tinyconfig and other configurations that turn off CONFIG_POSIX_TIMERS. Use the same #ifdef check for the stub version as well. Fixes: 4c22ea2b9120 ("y2038: use compat_{get,set}_itimer on alpha") Reported-by: Guenter Roeck Reported-by: kbuild test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20191207191043.656328-1-arnd@arndb.de --- kernel/time/posix-stubs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..20c65a7d4e3a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -151,6 +151,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); +#endif + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif -- cgit v1.2.3 From 2707745533d6d38fa7d3a2212f1fd599c3879491 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 7 Jan 2020 02:06:29 +0100 Subject: time/sched_clock: Disable interrupts in sched_clock_register() Instead of issueing a warning if sched_clock_register() is called from a context where IRQs are enabled, the code now ensures that IRQs are indeed disabled. Signed-off-by: Paul Cercueil Signed-off-by: Thomas Gleixner Acked-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200107010630.954648-1-paul@crapouillou.net --- kernel/time/sched_clock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index dbd69052eaa6..e4332e3e2d56 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -169,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - unsigned long r; + unsigned long r, flags; char r_unit; struct clock_read_data rd; if (cd.rate > rate) return; - WARN_ON(!irqs_disabled()); + /* Cannot register a sched_clock with interrupts on */ + local_irq_save(flags); /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); @@ -233,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); + local_irq_restore(flags); + pr_debug("Registered %pS as sched_clock source\n", read); } -- cgit v1.2.3 From c475c77d5b56398303e726969e81208196b3aab3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 6 Jan 2020 22:28:20 +0000 Subject: kunit: allow kunit tests to be loaded as a module As tests are added to kunit, it will become less feasible to execute all built tests together. By supporting modular tests we provide a simple way to do selective execution on a running system; specifying CONFIG_KUNIT=y CONFIG_KUNIT_EXAMPLE_TEST=m ...means we can simply "insmod example-test.ko" to run the tests. To achieve this we need to do the following: o export the required symbols in kunit o string-stream tests utilize non-exported symbols so for now we skip building them when CONFIG_KUNIT_TEST=m. o drivers/base/power/qos-test.c contains a few unexported interface references, namely freq_qos_read_value() and freq_constraints_init(). Both of these could be potentially defined as static inline functions in include/linux/pm_qos.h, but for now we simply avoid supporting module build for that test suite. o support a new way of declaring test suites. Because a module cannot do multiple late_initcall()s, we provide a kunit_test_suites() macro to declare multiple suites within the same module at once. o some test module names would have been too general ("test-test" and "example-test" for kunit tests, "inode-test" for ext4 tests); rename these as appropriate ("kunit-test", "kunit-example-test" and "ext4-inode-test" respectively). Also define kunit_test_suite() via kunit_test_suites() as callers in other trees may need the old definition. Co-developed-by: Knut Omang Signed-off-by: Knut Omang Signed-off-by: Alan Maguire Reviewed-by: Brendan Higgins Acked-by: Theodore Ts'o # for ext4 bits Acked-by: David Gow # For list-test Reported-by: kbuild test robot Signed-off-by: Shuah Khan --- kernel/sysctl-test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 2a63241a8453..ccb78509f1a8 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -389,4 +389,6 @@ static struct kunit_suite sysctl_test_suite = { .test_cases = sysctl_test_cases, }; -kunit_test_suite(sysctl_test_suite); +kunit_test_suites(&sysctl_test_suite); + +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 5c0e9de06577ba7599b75f97a8bb8cc63f6cb2ad Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 10 Jan 2020 10:08:12 +0000 Subject: PM: hibernate: fix spelling mistake "shapshot" -> "snapshot" There is a spelling mistake in a pr_info message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 353c5e9070ed..befe3c94767c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1841,7 +1841,7 @@ int hibernate_preallocate_memory(void) out: stop = ktime_get(); - pr_info("Allocated %lu pages for shapshot\n", pages); + pr_info("Allocated %lu pages for snapshot\n", pages); swsusp_show_speed(start, stop, pages, "Allocated"); return 0; -- cgit v1.2.3 From 51c39bb1d5d105a02e29aa7960f0a395086e6342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 9 Jan 2020 22:41:20 -0800 Subject: bpf: Introduce function-by-function verification New llvm and old llvm with libbpf help produce BTF that distinguish global and static functions. Unlike arguments of static function the arguments of global functions cannot be removed or optimized away by llvm. The compiler has to use exactly the arguments specified in a function prototype. The argument type information allows the verifier validate each global function independently. For now only supported argument types are pointer to context and scalars. In the future pointers to structures, sizes, pointer to packet data can be supported as well. Consider the following example: static int f1(int ...) { ... } int f3(int b); int f2(int a) { f1(a) + f3(a); } int f3(int b) { ... } int main(...) { f1(...) + f2(...) + f3(...); } The verifier will start its safety checks from the first global function f2(). It will recursively descend into f1() because it's static. Then it will check that arguments match for the f3() invocation inside f2(). It will not descend into f3(). It will finish f2() that has to be successfully verified for all possible values of 'a'. Then it will proceed with f3(). That function also has to be safe for all possible values of 'b'. Then it will start subprog 0 (which is main() function). It will recursively descend into f1() and will skip full check of f2() and f3(), since they are global. The order of processing global functions doesn't affect safety, since all global functions must be proven safe based on their arguments only. Such function by function verification can drastically improve speed of the verification and reduce complexity. Note that the stack limit of 512 still applies to the call chain regardless whether functions were static or global. The nested level of 8 also still applies. The same recursion prevention checks are in place as well. The type information and static/global kind is preserved after the verification hence in the above example global function f2() and f3() can be replaced later by equivalent functions with the same types that are loaded and verified later without affecting safety of this main() program. Such replacement (re-linking) of global functions is a subject of future patches. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200110064124.1760511-3-ast@kernel.org --- kernel/bpf/btf.c | 175 ++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 252 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 346 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 81d9cf75cacd..832b5d7fd892 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2651,8 +2651,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); + if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { + btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } @@ -3506,7 +3506,8 @@ static u8 bpf_ctx_convert_map[] = { static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type) + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *conv_struct; const struct btf_type *ctx_struct; @@ -3527,12 +3528,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - bpf_log(log, "BPF program ctx type is not a struct\n"); + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } /* prog_type is valid bpf program type. No need for bounds check. */ @@ -3565,11 +3567,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, - enum bpf_prog_type prog_type) + enum bpf_prog_type prog_type, + int arg) { const struct btf_member *prog_ctx_type, *kern_ctx_type; - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); if (!prog_ctx_type) return -ENOENT; kern_ctx_type = prog_ctx_type + 1; @@ -3731,7 +3734,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -4112,11 +4115,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) { - struct bpf_verifier_state *st = env->cur_state; - struct bpf_func_state *func = st->frame[st->curframe]; - struct bpf_reg_state *reg = func->regs; struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; @@ -4126,27 +4134,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) const char *tname; if (!prog->aux->func_info) - return 0; + return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) - return 0; + return -EFAULT; if (prog->aux->func_info_aux[subprog].unreliable) - return 0; + return -EINVAL; t = btf_type_by_id(btf, btf_id); if (!t || !btf_type_is_func(t)) { - bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); - return -EINVAL; + return -EFAULT; } tname = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid type of func %s\n", tname); - return -EINVAL; + bpf_log(log, "Invalid BTF of func %s\n", tname); + return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); @@ -4172,25 +4183,127 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) bpf_log(log, "R%d is not a pointer\n", i + 1); goto out; } - /* If program is passing PTR_TO_CTX into subprogram - * check that BTF type matches. + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. */ - if (reg[i + 1].type == PTR_TO_CTX && - !btf_get_prog_ctx_type(log, btf, t, prog->type)) - goto out; - /* All other pointers are ok */ - continue; + if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + if (reg[i + 1].type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + if (check_ctx_reg(env, ®[i + 1], i + 1)) + goto out; + continue; + } } - bpf_log(log, "Unrecognized argument type %s\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } return 0; out: - /* LLVM optimizations can remove arguments from static functions. */ - bpf_log(log, - "Type info disagrees with actual arguments due to compiler optimizations\n"); + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ prog->aux->func_info_aux[subprog].unreliable = true; + return -EINVAL; +} + +/* Convert BTF of a function into bpf_reg_state if possible + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - cannot convert BTF. + * 0 - Successfully converted BTF into bpf_reg_state + * (either PTR_TO_CTX or SCALAR_VALUE). + */ +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) +{ + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info || + prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + bpf_log(log, "Verifier bug\n"); + return -EFAULT; + } + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) { + bpf_log(log, "Global functions need valid BTF\n"); + return -EFAULT; + } + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", + subprog); + return -EFAULT; + } + tname = btf_name_by_offset(btf, t->name_off); + + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "Validating %s() func#%d...\n", + tname, subprog); + + if (prog->aux->func_info_aux[subprog].unreliable) { + bpf_log(log, "Verifier bug in function %s()\n", tname); + return -EFAULT; + } + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of function %s()\n", tname); + return -EFAULT; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", + tname, nargs); + return -EINVAL; + } + /* check that function returns int */ + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + bpf_log(log, + "Global function %s() doesn't return scalar. Only those are supported.\n", + tname); + return -EINVAL; + } + /* Convert BTF function arguments into verifier types. + * Only PTR_TO_CTX and SCALAR are supported atm. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + reg[i + 1].type = SCALAR_VALUE; + continue; + } + if (btf_type_is_ptr(t) && + btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + reg[i + 1].type = PTR_TO_CTX; + continue; + } + bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + return -EINVAL; + } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5af759a8a5f..ca17dccc17ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; - - /* 1st arg to a function */ - regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(env, regs, BPF_REG_1); } #define BPF_MAIN_FUNC (-1) @@ -2739,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { /* Access to ctx or passing it to a helper is only allowed in * its original, unmodified form. @@ -3956,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void clear_caller_saved_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + int i; + + /* after the call registers r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int i, err, subprog, target_insn; + bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -3984,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + func_info_aux = env->prog->aux->func_info_aux; + if (func_info_aux) + is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (is_global) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", + subprog); + return err; + } else { + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, + "Func#%d is global and valid. Skipping.\n", + subprog); + clear_caller_saved_regs(env, caller->regs); + + /* All global functions return SCALAR_VALUE */ + mark_reg_unknown(env, caller->regs, BPF_REG_0); + + /* continue with next insn after call */ + return 0; + } + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -4010,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call registers r0 - r5 were scratched */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, caller->regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); - } + clear_caller_saved_regs(env, caller->regs); /* only increment it after check_reg_arg() finished */ state->curframe++; - if (btf_check_func_arg_match(env, subprog)) - return -EINVAL; - /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -6771,12 +6800,13 @@ static int check_btf_func(struct bpf_verifier_env *env, /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; goto err_free; } + info_aux[i].linkage = BTF_INFO_VLEN(type->info); prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7756,35 +7786,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; - env->prev_linfo = NULL; - - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); - if (!state) - return -ENOMEM; - state->curframe = 0; - state->speculative = false; - state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); - if (!state->frame[0]) { - kfree(state); - return -ENOMEM; - } - env->cur_state = state; - init_func_state(env, state->frame[0], - BPF_MAIN_FUNC /* callsite */, - 0 /* frameno */, - 0 /* subprogno, zero == main subprog */); - - if (btf_check_func_arg_match(env, 0)) - return -EINVAL; - for (;;) { struct bpf_insn *insn; u8 class; @@ -7862,7 +7870,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -8082,7 +8090,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -8095,7 +8103,6 @@ process_bpf_exit: env->insn_idx++; } - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8372,7 +8379,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = true; + new_data[i].seen = env->pass_cnt; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -9484,6 +9491,7 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->free_list = NULL; if (!env->explored_states) return; @@ -9497,11 +9505,159 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->explored_states[i] = NULL; } +} - kvfree(env->explored_states); +/* The verifier is using insn_aux_data[] to store temporary data during + * verification and to store information for passes that run after the + * verification like dead code sanitization. do_check_common() for subprogram N + * may analyze many other subprograms. sanitize_insn_aux_data() clears all + * temporary data after do_check_common() finds that subprogram N cannot be + * verified independently. pass_cnt counts the number of times + * do_check_common() was run and insn->aux->seen tells the pass number + * insn_aux_data was touched. These variables are compared to clear temporary + * data from failed pass. For testing and experiments do_check_common() can be + * run multiple times even when prior attempt to verify is unsuccessful. + */ +static void sanitize_insn_aux_data(struct bpf_verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_insn_aux_data *aux; + int i, class; + + for (i = 0; i < env->prog->len; i++) { + class = BPF_CLASS(insn[i].code); + if (class != BPF_LDX && class != BPF_STX) + continue; + aux = &env->insn_aux_data[i]; + if (aux->seen != env->pass_cnt) + continue; + memset(aux, 0, offsetof(typeof(*aux), orig_idx)); + } } +static int do_check_common(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *state; + struct bpf_reg_state *regs; + int ret, i; + + env->prev_linfo = NULL; + env->pass_cnt++; + + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + state->curframe = 0; + state->speculative = false; + state->branches = 1; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + subprog); + + regs = state->frame[state->curframe]->regs; + if (subprog) { + ret = btf_prepare_func_args(env, subprog, regs); + if (ret) + goto out; + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (regs[i].type == PTR_TO_CTX) + mark_reg_known_zero(env, regs, i); + else if (regs[i].type == SCALAR_VALUE) + mark_reg_unknown(env, regs, i); + } + } else { + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; + mark_reg_known_zero(env, regs, BPF_REG_1); + ret = btf_check_func_arg_match(env, subprog, regs); + if (ret == -EFAULT) + /* unlikely verifier bug. abort. + * ret == 0 and ret < 0 are sadly acceptable for + * main() function due to backward compatibility. + * Like socket filter program may be written as: + * int bpf_prog(struct pt_regs *ctx) + * and never dereference that ctx in the program. + * 'struct pt_regs' is a type mismatch for socket + * filter that should be using 'struct __sk_buff'. + */ + goto out; + } + + ret = do_check(env); +out: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL)); + free_states(env); + if (ret) + /* clean aux data in case subprog was rejected */ + sanitize_insn_aux_data(env); + return ret; +} + +/* Verify all global functions in a BPF program one by one based on their BTF. + * All global functions must pass verification. Otherwise the whole program is rejected. + * Consider: + * int bar(int); + * int foo(int f) + * { + * return bar(f); + * } + * int bar(int b) + * { + * ... + * } + * foo() will be verified first for R1=any_scalar_value. During verification it + * will be assumed that bar() already verified successfully and call to bar() + * from foo() will be checked for type match only. Later bar() will be verified + * independently to check that it's safe for R1=any_scalar_value. + */ +static int do_check_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + int i, ret; + + if (!aux->func_info) + return 0; + + for (i = 1; i < env->subprog_cnt; i++) { + if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + continue; + env->insn_idx = env->subprog_info[i].start; + WARN_ON_ONCE(env->insn_idx == 0); + ret = do_check_common(env, i); + if (ret) { + return ret; + } else if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, + "Func#%d is safe for any args that match its prototype\n", + i); + } + } + return 0; +} + +static int do_check_main(struct bpf_verifier_env *env) +{ + int ret; + + env->insn_idx = 0; + ret = do_check_common(env, 0); + if (!ret) + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + return ret; +} + + static void print_verification_stats(struct bpf_verifier_env *env) { int i; @@ -9849,18 +10005,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } + ret = do_check_subprogs(env); + ret = ret ?: do_check_main(env); if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); + kvfree(env->explored_states); if (ret == 0) ret = check_max_stack_depth(env); -- cgit v1.2.3 From 56de4e8f9146680bcd048a29888f7438d5e58c55 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 13 Dec 2019 13:21:30 -0500 Subject: perf: Make struct ring_buffer less ambiguous eBPF requires needing to know the size of the perf ring buffer structure. But it unfortunately has the same name as the generic ring buffer used by tracing and oprofile. To make it less ambiguous, rename the perf ring buffer structure to "perf_buffer". As other parts of the ring buffer code has "perf_" as the prefix, it only makes sense to give the ring buffer the "perf_" prefix as well. Link: https://lore.kernel.org/r/20191213153553.GE20583@krava Acked-by: Peter Zijlstra Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/events/core.c | 42 +++++++++++++++++------------------ kernel/events/internal.h | 34 ++++++++++++++-------------- kernel/events/ring_buffer.c | 54 ++++++++++++++++++++++----------------------- 3 files changed, 65 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a1f8bde19b56..455451d24b4a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4373,7 +4373,7 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); + struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { @@ -5054,7 +5054,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct ring_buffer *rb; + struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); @@ -5296,7 +5296,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return perf_event_set_bpf_prog(event, arg); case PERF_EVENT_IOC_PAUSE_OUTPUT: { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5432,7 +5432,7 @@ static void calc_timer_values(struct perf_event *event, static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5464,7 +5464,7 @@ void __weak arch_perf_update_userpage( void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); @@ -5515,7 +5515,7 @@ EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; - struct ring_buffer *rb; + struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -5548,9 +5548,9 @@ unlock: } static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb) + struct perf_buffer *rb) { - struct ring_buffer *old_rb = NULL; + struct perf_buffer *old_rb = NULL; unsigned long flags; if (event->rb) { @@ -5608,7 +5608,7 @@ static void ring_buffer_attach(struct perf_event *event, static void ring_buffer_wakeup(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5619,9 +5619,9 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct perf_buffer *ring_buffer_get(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5634,7 +5634,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; @@ -5672,7 +5672,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = ring_buffer_get(event); + struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -5790,8 +5790,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; - struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; @@ -6266,7 +6266,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, size_t size) { struct perf_event *sampler = event->aux_event; - struct ring_buffer *rb; + struct perf_buffer *rb; data->aux_size = 0; @@ -6299,7 +6299,7 @@ out: return data->aux_size; } -long perf_pmu_snapshot_aux(struct ring_buffer *rb, +long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) @@ -6338,8 +6338,8 @@ static void perf_aux_sample_output(struct perf_event *event, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; + struct perf_buffer *rb; unsigned long pad; - struct ring_buffer *rb; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) @@ -6707,7 +6707,7 @@ void perf_output_sample(struct perf_output_handle *handle, int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { @@ -7150,7 +7150,7 @@ void perf_event_exec(void) } struct remote_output { - struct ring_buffer *rb; + struct perf_buffer *rb; int err; }; @@ -7158,7 +7158,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; - struct ring_buffer *rb = ro->rb; + struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; @@ -10998,7 +10998,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL; + struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 747d67f130cb..f16f66b6b655 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -10,7 +10,7 @@ #define RING_BUFFER_WRITABLE 0x01 -struct ring_buffer { +struct perf_buffer { refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC @@ -58,17 +58,17 @@ struct ring_buffer { void *data_pages[0]; }; -extern void rb_free(struct ring_buffer *rb); +extern void rb_free(struct perf_buffer *rb); static inline void rb_free_rcu(struct rcu_head *rcu_head) { - struct ring_buffer *rb; + struct perf_buffer *rb; - rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb = container_of(rcu_head, struct perf_buffer, rcu_head); rb_free(rb); } -static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause) { if (!pause && rb->nr_pages) rb->paused = 0; @@ -76,16 +76,16 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) rb->paused = 1; } -extern struct ring_buffer * +extern struct perf_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); -extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, +extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags); -extern void rb_free_aux(struct ring_buffer *rb); -extern struct ring_buffer *ring_buffer_get(struct perf_event *event); -extern void ring_buffer_put(struct ring_buffer *rb); +extern void rb_free_aux(struct perf_buffer *rb); +extern struct perf_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct perf_buffer *rb); -static inline bool rb_has_aux(struct ring_buffer *rb) +static inline bool rb_has_aux(struct perf_buffer *rb) { return !!rb->aux_nr_pages; } @@ -94,7 +94,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); extern struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); +perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff); #ifdef CONFIG_PERF_USE_VMALLOC /* @@ -103,25 +103,25 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct ring_buffer *rb) +static inline int page_order(struct perf_buffer *rb) { return rb->page_order; } #else -static inline int page_order(struct ring_buffer *rb) +static inline int page_order(struct perf_buffer *rb) { return 0; } #endif -static inline unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline unsigned long perf_aux_size(struct ring_buffer *rb) +static inline unsigned long perf_aux_size(struct perf_buffer *rb) { return rb->aux_nr_pages << PAGE_SHIFT; } @@ -141,7 +141,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) buf += written; \ handle->size -= written; \ if (!handle->size) { \ - struct ring_buffer *rb = handle->rb; \ + struct perf_buffer *rb = handle->rb; \ \ handle->page++; \ handle->page &= rb->nr_pages - 1; \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7ffd5c763f93..192b8abc6330 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -35,7 +35,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; preempt_disable(); @@ -49,7 +49,7 @@ static void perf_output_get_handle(struct perf_output_handle *handle) static void perf_output_put_handle(struct perf_output_handle *handle) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; unsigned long head; unsigned int nest; @@ -150,7 +150,7 @@ __perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, bool backward) { - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long tail, offset, head; int have_lost, page_shift; struct { @@ -301,7 +301,7 @@ void perf_output_end(struct perf_output_handle *handle) } static void -ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) +ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) { long max_size = perf_data_size(rb); @@ -361,7 +361,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, { struct perf_event *output_event = event; unsigned long aux_head, aux_tail; - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned int nest; if (output_event->parent) @@ -449,7 +449,7 @@ err: } EXPORT_SYMBOL_GPL(perf_aux_output_begin); -static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb) +static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) { if (rb->aux_overwrite) return false; @@ -475,7 +475,7 @@ static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb) void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; unsigned long aux_head; /* in overwrite mode, driver provides aux_head via handle */ @@ -532,7 +532,7 @@ EXPORT_SYMBOL_GPL(perf_aux_output_end); */ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; if (size > handle->size) return -ENOSPC; @@ -569,8 +569,8 @@ long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to) { + struct perf_buffer *rb = aux_handle->rb; unsigned long tocopy, remainder, len = 0; - struct ring_buffer *rb = aux_handle->rb; void *addr; from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; @@ -626,7 +626,7 @@ static struct page *rb_alloc_aux_page(int node, int order) return page; } -static void rb_free_aux_page(struct ring_buffer *rb, int idx) +static void rb_free_aux_page(struct perf_buffer *rb, int idx) { struct page *page = virt_to_page(rb->aux_pages[idx]); @@ -635,7 +635,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } -static void __rb_free_aux(struct ring_buffer *rb) +static void __rb_free_aux(struct perf_buffer *rb) { int pg; @@ -662,7 +662,7 @@ static void __rb_free_aux(struct ring_buffer *rb) } } -int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, +int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); @@ -753,7 +753,7 @@ out: return ret; } -void rb_free_aux(struct ring_buffer *rb) +void rb_free_aux(struct perf_buffer *rb) { if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); @@ -766,7 +766,7 @@ void rb_free_aux(struct ring_buffer *rb) */ static struct page * -__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -798,13 +798,13 @@ static void perf_mmap_free_page(void *addr) __free_page(page); } -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long size; int i; - size = sizeof(struct ring_buffer); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) @@ -843,7 +843,7 @@ fail: return NULL; } -void rb_free(struct ring_buffer *rb) +void rb_free(struct perf_buffer *rb) { int i; @@ -854,13 +854,13 @@ void rb_free(struct ring_buffer *rb) } #else -static int data_page_nr(struct ring_buffer *rb) +static int data_page_nr(struct perf_buffer *rb) { return rb->nr_pages << page_order(rb); } static struct page * -__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -878,11 +878,11 @@ static void perf_mmap_unmark_page(void *addr) static void rb_free_work(struct work_struct *work) { - struct ring_buffer *rb; + struct perf_buffer *rb; void *base; int i, nr; - rb = container_of(work, struct ring_buffer, work); + rb = container_of(work, struct perf_buffer, work); nr = data_page_nr(rb); base = rb->user_page; @@ -894,18 +894,18 @@ static void rb_free_work(struct work_struct *work) kfree(rb); } -void rb_free(struct ring_buffer *rb) +void rb_free(struct perf_buffer *rb) { schedule_work(&rb->work); } -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long size; void *all_buf; - size = sizeof(struct ring_buffer); + size = sizeof(struct perf_buffer); size += sizeof(void *); rb = kzalloc(size, GFP_KERNEL); @@ -939,7 +939,7 @@ fail: #endif struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (rb->aux_nr_pages) { /* above AUX space */ -- cgit v1.2.3 From 1c5eb4481e0151d579f738175497f998840f7bbc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Jan 2020 18:53:48 -0500 Subject: tracing: Rename trace_buffer to array_buffer As we are working to remove the generic "ring_buffer" name that is used by both tracing and perf, the ring_buffer name for tracing will be renamed to trace_buffer, and perf's ring buffer will be renamed to perf_buffer. As there already exists a trace_buffer that is used by the trace_arrays, it needs to be first renamed to array_buffer. Link: https://lore.kernel.org/r/20191213153553.GE20583@krava Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/blktrace.c | 4 +- kernel/trace/ftrace.c | 8 +- kernel/trace/trace.c | 230 +++++++++++++++++------------------ kernel/trace/trace.h | 16 +-- kernel/trace/trace_branch.c | 4 +- kernel/trace/trace_events.c | 18 +-- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_functions.c | 8 +- kernel/trace/trace_functions_graph.c | 14 +-- kernel/trace/trace_hwlat.c | 2 +- kernel/trace/trace_irqsoff.c | 8 +- kernel/trace/trace_kdb.c | 8 +- kernel/trace/trace_mmiotrace.c | 12 +- kernel/trace/trace_output.c | 2 +- kernel/trace/trace_sched_wakeup.c | 20 +-- kernel/trace/trace_selftest.c | 26 ++-- kernel/trace/trace_syscalls.c | 4 +- 17 files changed, 193 insertions(+), 193 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 475e29498bca..3b926f62ed83 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -75,7 +75,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { - buffer = blk_tr->trace_buffer.buffer; + buffer = blk_tr->array_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, @@ -248,7 +248,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->trace_buffer.buffer; + buffer = blk_tr->array_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9bf1f2cd515e..3f0ae07e72ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -146,7 +146,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, { struct trace_array *tr = op->private; - if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) + if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -6922,7 +6922,7 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->function_pids); - this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, trace_ignore_this_task(pid_list, next)); } @@ -6976,7 +6976,7 @@ static void clear_ftrace_pids(struct trace_array *tr) unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); for_each_possible_cpu(cpu) - per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; + per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false; rcu_assign_pointer(tr->function_pids, NULL); @@ -7100,7 +7100,7 @@ static void ignore_task_cpu(void *data) pid_list = rcu_dereference_protected(tr->function_pids, mutex_is_locked(&ftrace_lock)); - this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, trace_ignore_this_task(pid_list, current)); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ddb7e7f5fe8d..67084b7945ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -603,7 +603,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, return read; } -static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; @@ -619,7 +619,7 @@ static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu) u64 ftrace_now(int cpu) { - return buffer_ftrace_now(&global_trace.trace_buffer, cpu); + return buffer_ftrace_now(&global_trace.array_buffer, cpu); } /** @@ -796,8 +796,8 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, void tracer_tracing_on(struct trace_array *tr) { - if (tr->trace_buffer.buffer) - ring_buffer_record_on(tr->trace_buffer.buffer); + if (tr->array_buffer.buffer) + ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to @@ -865,7 +865,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) alloc = sizeof(*entry) + size + 2; /* possible \n added */ local_save_flags(irq_flags); - buffer = global_trace.trace_buffer.buffer; + buffer = global_trace.array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, irq_flags, pc); if (!event) @@ -913,7 +913,7 @@ int __trace_bputs(unsigned long ip, const char *str) return 0; local_save_flags(irq_flags); - buffer = global_trace.trace_buffer.buffer; + buffer = global_trace.array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, irq_flags, pc); if (!event) @@ -1036,9 +1036,9 @@ void *tracing_cond_snapshot_data(struct trace_array *tr) } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); -static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, - struct trace_buffer *size_buf, int cpu_id); -static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { @@ -1048,7 +1048,7 @@ int tracing_alloc_snapshot_instance(struct trace_array *tr) /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; @@ -1251,8 +1251,8 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); void tracer_tracing_off(struct trace_array *tr) { - if (tr->trace_buffer.buffer) - ring_buffer_record_off(tr->trace_buffer.buffer); + if (tr->array_buffer.buffer) + ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to @@ -1294,8 +1294,8 @@ void disable_trace_on_warning(void) */ bool tracer_tracing_is_on(struct trace_array *tr) { - if (tr->trace_buffer.buffer) - return ring_buffer_record_is_on(tr->trace_buffer.buffer); + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } @@ -1590,8 +1590,8 @@ void latency_fsnotify(struct trace_array *tr) static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_buffer *trace_buf = &tr->trace_buffer; - struct trace_buffer *max_buf = &tr->max_buffer; + struct array_buffer *trace_buf = &tr->array_buffer; + struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); @@ -1649,8 +1649,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, arch_spin_lock(&tr->max_lock); - /* Inherit the recordable setting from trace_buffer */ - if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) ring_buffer_record_on(tr->max_buffer.buffer); else ring_buffer_record_off(tr->max_buffer.buffer); @@ -1659,7 +1659,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) goto out_unlock; #endif - swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); + swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); @@ -1692,7 +1692,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); - ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -1718,7 +1718,7 @@ static int wait_on_pipe(struct trace_iterator *iter, int full) if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); } @@ -1769,7 +1769,7 @@ static int run_tracer_selftest(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; @@ -1795,7 +1795,7 @@ static int run_tracer_selftest(struct tracer *type) return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { @@ -1962,7 +1962,7 @@ int __init register_tracer(struct tracer *type) return ret; } -static void tracing_reset_cpu(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct ring_buffer *buffer = buf->buffer; @@ -1978,7 +1978,7 @@ static void tracing_reset_cpu(struct trace_buffer *buf, int cpu) ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_buffer *buf) +void tracing_reset_online_cpus(struct array_buffer *buf) { struct ring_buffer *buffer = buf->buffer; int cpu; @@ -2008,7 +2008,7 @@ void tracing_reset_all_online_cpus(void) if (!tr->clear_trace) continue; tr->clear_trace = false; - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif @@ -2117,7 +2117,7 @@ void tracing_start(void) /* Prevent the buffers from switching */ arch_spin_lock(&global_trace.max_lock); - buffer = global_trace.trace_buffer.buffer; + buffer = global_trace.array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); @@ -2156,7 +2156,7 @@ static void tracing_start_tr(struct trace_array *tr) goto out; } - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); @@ -2182,7 +2182,7 @@ void tracing_stop(void) /* Prevent the buffers from switching */ arch_spin_lock(&global_trace.max_lock); - buffer = global_trace.trace_buffer.buffer; + buffer = global_trace.array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -2211,7 +2211,7 @@ static void tracing_stop_tr(struct trace_array *tr) if (tr->stop_count++) goto out; - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -2572,7 +2572,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, struct ring_buffer_event *entry; int val; - *current_rb = trace_file->tr->trace_buffer.buffer; + *current_rb = trace_file->tr->array_buffer.buffer; if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && @@ -2845,7 +2845,7 @@ trace_function(struct trace_array *tr, int pc) { struct trace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -2971,7 +2971,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(buffer, flags, skip, pc, NULL); @@ -3009,7 +3009,7 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. */ skip++; #endif - __ftrace_trace_stack(global_trace.trace_buffer.buffer, + __ftrace_trace_stack(global_trace.array_buffer.buffer, flags, skip, preempt_count(), NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); @@ -3154,7 +3154,7 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.trace_buffer.buffer) + if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } EXPORT_SYMBOL_GPL(trace_printk_init_buffers); @@ -3217,7 +3217,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -3302,7 +3302,7 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } __printf(3, 0) @@ -3367,7 +3367,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, + event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); if (event) { @@ -3382,7 +3382,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->trace_buffer->buffer; + struct ring_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -3459,7 +3459,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -3497,7 +3497,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) unsigned long entries = 0; u64 ts; - per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -3511,13 +3511,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. */ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->trace_buffer->time_start) + if (ts >= iter->array_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* @@ -3602,7 +3602,7 @@ static void s_stop(struct seq_file *m, void *p) } static void -get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, +get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, unsigned long *entries, int cpu) { unsigned long count; @@ -3624,7 +3624,7 @@ get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, } static void -get_total_entries(struct trace_buffer *buf, +get_total_entries(struct array_buffer *buf, unsigned long *total, unsigned long *entries) { unsigned long t, e; @@ -3647,7 +3647,7 @@ unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) if (!tr) tr = &global_trace; - get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); return entries; } @@ -3659,7 +3659,7 @@ unsigned long trace_total_entries(struct trace_array *tr) if (!tr) tr = &global_trace; - get_total_entries(&tr->trace_buffer, &total, &entries); + get_total_entries(&tr->array_buffer, &total, &entries); return entries; } @@ -3676,7 +3676,7 @@ static void print_lat_help_header(struct seq_file *m) "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_buffer *buf, struct seq_file *m) +static void print_event_info(struct array_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; @@ -3687,7 +3687,7 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, +static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; @@ -3698,7 +3698,7 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, +static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; @@ -3720,7 +3720,7 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); - struct trace_buffer *buf = iter->trace_buffer; + struct array_buffer *buf = iter->array_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; @@ -3795,7 +3795,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) cpumask_test_cpu(iter->cpu, iter->started)) return; - if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) + if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) return; if (cpumask_available(iter->started)) @@ -3929,7 +3929,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } return 1; @@ -3941,7 +3941,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } } @@ -4031,10 +4031,10 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, + print_func_help_header_irq(iter->array_buffer, m, trace_flags); else - print_func_help_header(iter->trace_buffer, m, + print_func_help_header(iter->array_buffer, m, trace_flags); } } @@ -4192,10 +4192,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) #ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->trace_buffer = &tr->max_buffer; + iter->array_buffer = &tr->max_buffer; else #endif - iter->trace_buffer = &tr->trace_buffer; + iter->array_buffer = &tr->array_buffer; iter->snapshot = snapshot; iter->pos = -1; iter->cpu_file = tracing_get_cpu(inode); @@ -4206,7 +4206,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->trace_buffer->buffer)) + if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -4220,7 +4220,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, + ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); @@ -4231,7 +4231,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, + ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); @@ -4357,7 +4357,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); - struct trace_buffer *trace_buf = &tr->trace_buffer; + struct array_buffer *trace_buf = &tr->array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) @@ -4578,13 +4578,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); - ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); + atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); - ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); + atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); } } arch_spin_unlock(&tr->max_lock); @@ -4726,7 +4726,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) ftrace_pid_follow_fork(tr, enabled); if (mask == TRACE_ITER_OVERWRITE) { - ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); + ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif @@ -5534,11 +5534,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) +static void set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; @@ -5548,8 +5548,8 @@ static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, - struct trace_buffer *size_buf, int cpu_id) +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) { int cpu, ret = 0; @@ -5587,10 +5587,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ring_buffer_expanded = true; /* May be called before buffers are initialized */ - if (!tr->trace_buffer.buffer) + if (!tr->array_buffer.buffer) return 0; - ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) return ret; @@ -5601,8 +5601,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(&tr->trace_buffer, - &tr->trace_buffer, cpu); + int r = resize_buffer_duplicate_size(&tr->array_buffer, + &tr->array_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -5633,9 +5633,9 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->trace_buffer, size); + set_buffer_entries(&tr->array_buffer, size); else - per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; + per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size; return ret; } @@ -5979,7 +5979,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->tr = tr; - iter->trace_buffer = &tr->trace_buffer; + iter->array_buffer = &tr->array_buffer; iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); filp->private_data = iter; @@ -6039,7 +6039,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl */ return EPOLLIN | EPOLLRDNORM; else - return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, + return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, filp, poll_table); } @@ -6356,8 +6356,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; - if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { + size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { buf_size_same = 0; break; } @@ -6373,7 +6373,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -6420,7 +6420,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; + size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -6499,7 +6499,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (unlikely(!event)) @@ -6579,7 +6579,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, irq_flags, preempt_count()); if (!event) @@ -6634,13 +6634,13 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tr->clock_id = i; - ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); + ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) @@ -6703,7 +6703,7 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) mutex_lock(&trace_types_lock); - if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer)) + if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); @@ -6748,7 +6748,7 @@ int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) goto out; } - ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs); + ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) @@ -6797,7 +6797,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) ret = 0; iter->tr = tr; - iter->trace_buffer = &tr->max_buffer; + iter->array_buffer = &tr->max_buffer; iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; @@ -6860,7 +6860,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->trace_buffer, iter->cpu_file); + &tr->array_buffer, iter->cpu_file); else ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) @@ -6935,7 +6935,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) } info->iter.snapshot = true; - info->iter.trace_buffer = &info->iter.tr->max_buffer; + info->iter.array_buffer = &info->iter.tr->max_buffer; return ret; } @@ -7310,7 +7310,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) info->iter.tr = tr; info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; - info->iter.trace_buffer = &tr->trace_buffer; + info->iter.array_buffer = &tr->array_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; @@ -7355,7 +7355,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, #endif if (!info->spare) { - info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); if (IS_ERR(info->spare)) { ret = PTR_ERR(info->spare); @@ -7373,7 +7373,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, again: trace_access_lock(iter->cpu_file); - ret = ring_buffer_read_page(iter->trace_buffer->buffer, + ret = ring_buffer_read_page(iter->array_buffer->buffer, &info->spare, count, iter->cpu_file, 0); @@ -7423,7 +7423,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, + ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); kfree(info); @@ -7528,7 +7528,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, again: trace_access_lock(iter->cpu_file); - entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -7541,7 +7541,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } refcount_set(&ref->refcount, 1); - ref->buffer = iter->trace_buffer->buffer; + ref->buffer = iter->array_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { ret = PTR_ERR(ref->page); @@ -7569,7 +7569,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); @@ -7613,7 +7613,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; - struct trace_buffer *trace_buf = &tr->trace_buffer; + struct array_buffer *trace_buf = &tr->array_buffer; int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; @@ -8272,7 +8272,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; @@ -8362,7 +8362,7 @@ static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int -allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; @@ -8382,8 +8382,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size } /* Allocate the first page for all buffers */ - set_buffer_entries(&tr->trace_buffer, - ring_buffer_size(tr->trace_buffer.buffer, 0)); + set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } @@ -8392,7 +8392,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; - ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + ret = allocate_trace_buffer(tr, &tr->array_buffer, size); if (ret) return ret; @@ -8400,10 +8400,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (WARN_ON(ret)) { - ring_buffer_free(tr->trace_buffer.buffer); - tr->trace_buffer.buffer = NULL; - free_percpu(tr->trace_buffer.data); - tr->trace_buffer.data = NULL; + ring_buffer_free(tr->array_buffer.buffer); + tr->array_buffer.buffer = NULL; + free_percpu(tr->array_buffer.data); + tr->array_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -8417,7 +8417,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } -static void free_trace_buffer(struct trace_buffer *buf) +static void free_trace_buffer(struct array_buffer *buf) { if (buf->buffer) { ring_buffer_free(buf->buffer); @@ -8432,7 +8432,7 @@ static void free_trace_buffers(struct trace_array *tr) if (!tr) return; - free_trace_buffer(&tr->trace_buffer); + free_trace_buffer(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); @@ -9036,13 +9036,13 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->tr = &global_trace; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->trace_buffer = &global_trace.trace_buffer; + iter->array_buffer = &global_trace.array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->trace_buffer->buffer)) + if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -9083,7 +9083,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -9151,7 +9151,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } atomic_dec(&dump_running); printk_nmi_direct_exit(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 63bf60f79398..fd679fe92c1f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -176,7 +176,7 @@ struct trace_array_cpu { struct tracer; struct trace_option_dentry; -struct trace_buffer { +struct array_buffer { struct trace_array *tr; struct ring_buffer *buffer; struct trace_array_cpu __percpu *data; @@ -249,7 +249,7 @@ struct cond_snapshot { struct trace_array { struct list_head list; char *name; - struct trace_buffer trace_buffer; + struct array_buffer array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE /* * The max_buffer is used to snapshot the trace when a maximum @@ -257,12 +257,12 @@ struct trace_array { * Some tracers will use this to store a maximum trace while * it continues examining live traces. * - * The buffers for the max_buffer are set up the same as the trace_buffer + * The buffers for the max_buffer are set up the same as the array_buffer * When a snapshot is taken, the buffer of the max_buffer is swapped - * with the buffer of the trace_buffer and the buffers are reset for - * the trace_buffer so the tracing can continue. + * with the buffer of the array_buffer and the buffers are reset for + * the array_buffer so the tracing can continue. */ - struct trace_buffer max_buffer; + struct array_buffer max_buffer; bool allocated_snapshot; #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) @@ -685,7 +685,7 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset_online_cpus(struct trace_buffer *buf); +void tracing_reset_online_cpus(struct array_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -1057,7 +1057,7 @@ struct ftrace_func_command { extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { - return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); + return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 88e158d27965..d5989284a99a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -55,12 +55,12 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->trace_buffer.data); + data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; pc = preempt_count(); - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a5b614cc3887..ac557f685f0b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -237,7 +237,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list) return false; - data = this_cpu_ptr(tr->trace_buffer.data); + data = this_cpu_ptr(tr->array_buffer.data); return data->ignore_pid; } @@ -546,7 +546,7 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); - this_cpu_write(tr->trace_buffer.data->ignore_pid, + this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, prev) && trace_ignore_this_task(pid_list, next)); } @@ -560,7 +560,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); - this_cpu_write(tr->trace_buffer.data->ignore_pid, + this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, next)); } @@ -571,12 +571,12 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ - if (!this_cpu_read(tr->trace_buffer.data->ignore_pid)) + if (!this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); - this_cpu_write(tr->trace_buffer.data->ignore_pid, + this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, task)); } @@ -587,13 +587,13 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ - if (this_cpu_read(tr->trace_buffer.data->ignore_pid)) + if (this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); /* Set tracing if current is enabled */ - this_cpu_write(tr->trace_buffer.data->ignore_pid, + this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, current)); } @@ -625,7 +625,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) } for_each_possible_cpu(cpu) - per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false; + per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; rcu_assign_pointer(tr->filtered_pids, NULL); @@ -1594,7 +1594,7 @@ static void ignore_task_cpu(void *data) pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); - this_cpu_write(tr->trace_buffer.data->ignore_pid, + this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, current)); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f62de5f43e79..94c581c1a897 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -895,7 +895,7 @@ static notrace void trace_event_raw_event_synth(void *__data, * Avoid ring buffer recursion detection, as this event * is being performed within another event. */ - buffer = trace_file->tr->trace_buffer.buffer; + buffer = trace_file->tr->array_buffer.buffer; ring_buffer_nest_start(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b611cd36e22d..8a4c8d5c2c98 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -101,7 +101,7 @@ static int function_trace_init(struct trace_array *tr) ftrace_init_array_ops(tr, func); - tr->trace_buffer.cpu = get_cpu(); + tr->array_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); @@ -118,7 +118,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); } static void @@ -143,7 +143,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -192,7 +192,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 78af97163147..79b2c2df00c5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -101,7 +101,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -171,7 +171,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -221,7 +221,7 @@ void __trace_graph_return(struct trace_array *tr, { struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, @@ -252,7 +252,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -444,9 +444,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, + ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, + event = ring_buffer_peek(iter->array_buffer->buffer, iter->cpu, NULL, NULL); } @@ -503,7 +503,7 @@ print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) { unsigned long long usecs; - usecs = iter->ts - iter->trace_buffer->time_start; + usecs = iter->ts - iter->array_buffer->time_start; do_div(usecs, NSEC_PER_USEC); trace_seq_printf(s, "%9llu us | ", usecs); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 6638d63f0921..fc62a6049bd3 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -104,7 +104,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; struct trace_event_call *call = &event_hwlat; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; unsigned long flags; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a745b0cee5d3..10bbb0f381d5 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -122,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; - *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + *data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -167,7 +167,7 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) per_cpu(tracing_cpu, cpu) = 0; tr->max_latency = 0; - tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); + tracing_reset_online_cpus(&irqsoff_trace->array_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -382,7 +382,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) if (per_cpu(tracing_cpu, cpu)) return; - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -420,7 +420,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) if (!tracer_enabled || !tracing_is_enabled()) return; - data = per_cpu_ptr(tr->trace_buffer.data, cpu); + data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index cca65044c14c..9da76104f7a2 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,7 +43,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, + ring_buffer_read_prepare(iter.array_buffer->buffer, cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); @@ -51,7 +51,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, + ring_buffer_read_prepare(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); @@ -124,7 +124,7 @@ static int kdb_ftdump(int argc, const char **argv) iter.buffer_iter = buffer_iter; for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } /* A negative skip_entries means skip all but the last entries */ @@ -139,7 +139,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } kdb_trap_printk--; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b0388016b687..c30137148759 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -32,7 +32,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -122,7 +122,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); + unsigned long over = ring_buffer_overruns(iter->array_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -297,7 +297,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct trace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -318,7 +318,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); + struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -327,7 +327,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct trace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -351,7 +351,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); + data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d9b4b7c22db4..b4909082f6a4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -538,7 +538,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) struct trace_array *tr = iter->tr; unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; - unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; + unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 617e297f46dc..510fda2fcd24 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -82,7 +82,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + *data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -378,7 +378,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct trace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -408,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer *buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -459,7 +459,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -471,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, goto out_unlock; /* The task we are waiting for is waking up */ - data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -494,7 +494,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -513,7 +513,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -551,7 +551,7 @@ probe_wakeup(void *ignore, struct task_struct *p) return; pc = preempt_count(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -583,7 +583,7 @@ probe_wakeup(void *ignore, struct task_struct *p) local_save_flags(flags); - data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); __trace_stack(wakeup_trace, flags, 0, pc); @@ -598,7 +598,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 69ee8ef12cee..b5e3496cf803 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -23,7 +23,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) +static int trace_test_buffer_cpu(struct array_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; @@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count) +static int __maybe_unused trace_test_buffer(struct array_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; @@ -362,7 +362,7 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); + ret = trace_test_buffer(&tr->array_buffer, &count); if (ret) goto out; @@ -383,7 +383,7 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); + ret = trace_test_buffer(&tr->array_buffer, &count); ftrace_enabled = 1; tracing_start(); @@ -682,7 +682,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); + ret = trace_test_buffer(&tr->array_buffer, &count); ftrace_enabled = 1; trace->reset(tr); @@ -768,7 +768,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); ret = register_ftrace_graph(&fgraph_ops); if (ret) { @@ -790,7 +790,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); + ret = trace_test_buffer(&tr->array_buffer, &count); /* Need to also simulate the tr->reset to remove this fgraph_ops */ tracing_stop_cmdline_record(); @@ -848,7 +848,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(&tr->trace_buffer, NULL); + ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -910,7 +910,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(&tr->trace_buffer, NULL); + ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -976,7 +976,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(&tr->trace_buffer, NULL); + ret = trace_test_buffer(&tr->array_buffer, NULL); if (ret) goto out; @@ -1006,7 +1006,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(&tr->trace_buffer, NULL); + ret = trace_test_buffer(&tr->array_buffer, NULL); if (ret) goto out; @@ -1136,7 +1136,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(&tr->trace_buffer, NULL); + ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); @@ -1177,7 +1177,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); + ret = trace_test_buffer(&tr->array_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 16fa218556fa..bd92843c2b0e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -345,7 +345,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) local_save_flags(irq_flags); pc = preempt_count(); - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, irq_flags, pc); if (!event) @@ -391,7 +391,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) local_save_flags(irq_flags); pc = preempt_count(); - buffer = tr->trace_buffer.buffer; + buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), irq_flags, pc); -- cgit v1.2.3 From 13292494379f92f532de71b31a54018336adc589 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 13 Dec 2019 13:58:57 -0500 Subject: tracing: Make struct ring_buffer less ambiguous As there's two struct ring_buffers in the kernel, it causes some confusion. The other one being the perf ring buffer. It was agreed upon that as neither of the ring buffers are generic enough to be used globally, they should be renamed as: perf's ring_buffer -> perf_buffer ftrace's ring_buffer -> trace_buffer This implements the changes to the ring buffer that ftrace uses. Link: https://lore.kernel.org/r/20191213140531.116b3200@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/blktrace.c | 4 +- kernel/trace/ring_buffer.c | 124 +++++++++++++++++------------------ kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 70 ++++++++++---------- kernel/trace/trace.h | 22 +++---- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_functions_graph.c | 4 +- kernel/trace/trace_hwlat.c | 2 +- kernel/trace/trace_kprobe.c | 4 +- kernel/trace/trace_mmiotrace.c | 4 +- kernel/trace/trace_sched_wakeup.c | 4 +- kernel/trace/trace_syscalls.c | 4 +- kernel/trace/trace_uprobe.c | 2 +- 15 files changed, 126 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3b926f62ed83..0735ae8545d8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -68,7 +68,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; - struct ring_buffer *buffer = NULL; + struct trace_buffer *buffer = NULL; int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; @@ -215,7 +215,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; - struct ring_buffer *buffer = NULL; + struct trace_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f655371eaf6..f846de2aa435 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -443,7 +443,7 @@ enum { struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; - struct ring_buffer *buffer; + struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; @@ -482,7 +482,7 @@ struct ring_buffer_per_cpu { struct rb_irq_work irq_work; }; -struct ring_buffer { +struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; @@ -518,7 +518,7 @@ struct ring_buffer_iter { * * Returns the number of pages used by a per_cpu buffer of the ring buffer. */ -size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu) +size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) { return buffer->buffers[cpu]->nr_pages; } @@ -530,7 +530,7 @@ size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu) * * Returns the number of pages that have content in the ring buffer. */ -size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu) +size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; size_t cnt; @@ -573,7 +573,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full) +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); @@ -684,7 +684,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full) * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ -__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table) { struct ring_buffer_per_cpu *cpu_buffer; @@ -742,13 +742,13 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer) +static inline u64 rb_time_stamp(struct trace_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; } -u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) { u64 time; @@ -760,7 +760,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); -void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, +void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ @@ -1283,7 +1283,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu) +rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1374,10 +1374,10 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, +struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; long nr_pages; int bsize; int cpu; @@ -1447,7 +1447,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @buffer: the buffer to free. */ void -ring_buffer_free(struct ring_buffer *buffer) +ring_buffer_free(struct trace_buffer *buffer) { int cpu; @@ -1463,18 +1463,18 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); -void ring_buffer_set_clock(struct ring_buffer *buffer, +void ring_buffer_set_clock(struct trace_buffer *buffer, u64 (*clock)(void)) { buffer->clock = clock; } -void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs) +void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) { buffer->time_stamp_abs = abs; } -bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer) +bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) { return buffer->time_stamp_abs; } @@ -1712,7 +1712,7 @@ static void update_pages_handler(struct work_struct *work) * * Returns 0 on success and < 0 on failure. */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, +int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; @@ -1891,7 +1891,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, } EXPORT_SYMBOL_GPL(ring_buffer_resize); -void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) { mutex_lock(&buffer->mutex); if (val) @@ -2206,7 +2206,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, { struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; - struct ring_buffer *buffer = cpu_buffer->buffer; + struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; @@ -2609,7 +2609,7 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, } static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { size_t nr_pages; size_t dirty; @@ -2733,7 +2733,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) * Call this function before calling another ring_buffer_lock_reserve() and * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). */ -void ring_buffer_nest_start(struct ring_buffer *buffer) +void ring_buffer_nest_start(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; @@ -2753,7 +2753,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer) * Must be called after ring_buffer_nest_start() and after the * ring_buffer_unlock_commit(). */ -void ring_buffer_nest_end(struct ring_buffer *buffer) +void ring_buffer_nest_end(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; @@ -2775,7 +2775,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer) * * Must be paired with ring_buffer_lock_reserve. */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, +int ring_buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; @@ -2868,7 +2868,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } static __always_inline struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, +rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { @@ -2961,7 +2961,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) +ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -3062,7 +3062,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, * If this function is called, do not call ring_buffer_unlock_commit on * the event. */ -void ring_buffer_discard_commit(struct ring_buffer *buffer, +void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; @@ -3113,7 +3113,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); * Note, like ring_buffer_lock_reserve, the length is the length of the data * and not the length of the event which would hold the header. */ -int ring_buffer_write(struct ring_buffer *buffer, +int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data) { @@ -3193,7 +3193,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) * * The caller should call synchronize_rcu() after this. */ -void ring_buffer_record_disable(struct ring_buffer *buffer) +void ring_buffer_record_disable(struct trace_buffer *buffer) { atomic_inc(&buffer->record_disabled); } @@ -3206,7 +3206,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ -void ring_buffer_record_enable(struct ring_buffer *buffer) +void ring_buffer_record_enable(struct trace_buffer *buffer) { atomic_dec(&buffer->record_disabled); } @@ -3223,7 +3223,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ -void ring_buffer_record_off(struct ring_buffer *buffer) +void ring_buffer_record_off(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; @@ -3246,7 +3246,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ -void ring_buffer_record_on(struct ring_buffer *buffer) +void ring_buffer_record_on(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; @@ -3264,7 +3264,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on); * * Returns true if the ring buffer is in a state that it accepts writes. */ -bool ring_buffer_record_is_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_on(struct trace_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } @@ -3280,7 +3280,7 @@ bool ring_buffer_record_is_on(struct ring_buffer *buffer) * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ -bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } @@ -3295,7 +3295,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) * * The caller should call synchronize_rcu() after this. */ -void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; @@ -3315,7 +3315,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ -void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; @@ -3345,7 +3345,7 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ -u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; @@ -3378,7 +3378,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ -unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; @@ -3398,7 +3398,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. */ -unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; @@ -3417,7 +3417,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ -unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; @@ -3440,7 +3440,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long -ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; @@ -3462,7 +3462,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long -ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; @@ -3483,7 +3483,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); * @cpu: The per CPU buffer to get the number of events read */ unsigned long -ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; @@ -3502,7 +3502,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); * Returns the total number of entries in the ring buffer * (all CPU entries) */ -unsigned long ring_buffer_entries(struct ring_buffer *buffer) +unsigned long ring_buffer_entries(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long entries = 0; @@ -3525,7 +3525,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries); * Returns the total number of overruns in the ring buffer * (all CPU entries) */ -unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +unsigned long ring_buffer_overruns(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long overruns = 0; @@ -3949,7 +3949,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_peek); static struct ring_buffer_event * rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; @@ -4077,7 +4077,7 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) * not consume the data. */ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, +ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; @@ -4141,7 +4141,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, +ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; @@ -4201,7 +4201,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4332,7 +4332,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. */ -unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) +unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { /* * Earlier, this method returned @@ -4398,7 +4398,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ -void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; @@ -4435,7 +4435,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); * ring_buffer_reset - reset a ring buffer * @buffer: The ring buffer to reset all cpu buffers */ -void ring_buffer_reset(struct ring_buffer *buffer) +void ring_buffer_reset(struct trace_buffer *buffer) { int cpu; @@ -4448,7 +4448,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); * rind_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ -bool ring_buffer_empty(struct ring_buffer *buffer) +bool ring_buffer_empty(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4478,7 +4478,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); * @buffer: The ring buffer * @cpu: The CPU buffer to test */ -bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4510,8 +4510,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); * it is expected that the tracer handles the cpu buffer not being * used at the moment. */ -int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, - struct ring_buffer *buffer_b, int cpu) +int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, + struct trace_buffer *buffer_b, int cpu) { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; @@ -4590,7 +4590,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or ERR_PTR */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) +void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; @@ -4637,7 +4637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; @@ -4697,7 +4697,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * >=0 if data has been transferred, returns the offset of consumed data. * <0 if no data has been transferred. */ -int ring_buffer_read_page(struct ring_buffer *buffer, +int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; @@ -4868,12 +4868,12 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_page); */ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - buffer = container_of(node, struct ring_buffer, node); + buffer = container_of(node, struct trace_buffer, node); if (cpumask_test_cpu(cpu, buffer->cpumask)) return 0; @@ -4923,7 +4923,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) static struct task_struct *rb_threads[NR_CPUS] __initdata; struct rb_test_data { - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long events; unsigned long bytes_written; unsigned long bytes_alloc; @@ -5065,7 +5065,7 @@ static __init int rb_hammer_test(void *arg) static __init int test_ringbuffer(void) { struct task_struct *rb_hammer; - struct ring_buffer *buffer; + struct trace_buffer *buffer; int cpu; int ret = 0; diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 32149e46551c..8df0aa810950 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -29,7 +29,7 @@ static int reader_finish; static DECLARE_COMPLETION(read_start); static DECLARE_COMPLETION(read_done); -static struct ring_buffer *buffer; +static struct trace_buffer *buffer; static struct task_struct *producer; static struct task_struct *consumer; static unsigned long read; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67084b7945ff..b4a07d7ed82a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -163,7 +163,7 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); -static void ftrace_trace_userstack(struct ring_buffer *buffer, +static void ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 @@ -338,7 +338,7 @@ int tracing_check_open_get_tr(struct trace_array *tr) } int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event) { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && @@ -747,22 +747,22 @@ static inline void trace_access_lock_init(void) #endif #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct ring_buffer *buffer, +static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); #else -static inline void __ftrace_trace_stack(struct ring_buffer *buffer, +static inline void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { @@ -780,7 +780,7 @@ trace_event_setup(struct ring_buffer_event *event, } static __always_inline struct ring_buffer_event * -__trace_buffer_lock_reserve(struct ring_buffer *buffer, +__trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) @@ -825,7 +825,7 @@ EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_taskinfo_save, true); @@ -848,7 +848,7 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve int __trace_puts(unsigned long ip, const char *str, int size) { struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; int alloc; @@ -898,7 +898,7 @@ EXPORT_SYMBOL_GPL(__trace_puts); int __trace_bputs(unsigned long ip, const char *str) { struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct bputs_entry *entry; unsigned long irq_flags; int size = sizeof(struct bputs_entry); @@ -1964,7 +1964,7 @@ int __init register_tracer(struct tracer *type) static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { - struct ring_buffer *buffer = buf->buffer; + struct trace_buffer *buffer = buf->buffer; if (!buffer) return; @@ -1980,7 +1980,7 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu) void tracing_reset_online_cpus(struct array_buffer *buf) { - struct ring_buffer *buffer = buf->buffer; + struct trace_buffer *buffer = buf->buffer; int cpu; if (!buffer) @@ -2098,7 +2098,7 @@ int is_tracing_stopped(void) */ void tracing_start(void) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) @@ -2135,7 +2135,7 @@ void tracing_start(void) static void tracing_start_tr(struct trace_array *tr) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) @@ -2172,7 +2172,7 @@ static void tracing_start_tr(struct trace_array *tr) */ void tracing_stop(void) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long flags; raw_spin_lock_irqsave(&global_trace.start_lock, flags); @@ -2200,7 +2200,7 @@ void tracing_stop(void) static void tracing_stop_tr(struct trace_array *tr) { - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long flags; /* If global, we need to also stop the max tracer */ @@ -2442,7 +2442,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event * -trace_buffer_lock_reserve(struct ring_buffer *buffer, +trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) @@ -2561,10 +2561,10 @@ void trace_buffered_event_disable(void) preempt_enable(); } -static struct ring_buffer *temp_buffer; +static struct trace_buffer *temp_buffer; struct ring_buffer_event * -trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, +trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc) @@ -2689,7 +2689,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs) @@ -2710,7 +2710,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. */ void -trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, +trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event) { __buffer_unlock_commit(buffer, event); @@ -2845,7 +2845,7 @@ trace_function(struct trace_array *tr, int pc) { struct trace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -2883,7 +2883,7 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct ring_buffer *buffer, +static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { @@ -2958,7 +2958,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } static inline void ftrace_trace_stack(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { @@ -2971,7 +2971,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(buffer, flags, skip, pc, NULL); @@ -3018,7 +3018,7 @@ EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); static void -ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) +ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; @@ -3063,7 +3063,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ -static void ftrace_trace_userstack(struct ring_buffer *buffer, +static void ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) { } @@ -3188,7 +3188,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct trace_array *tr = &global_trace; struct bprint_entry *entry; unsigned long flags; @@ -3245,7 +3245,7 @@ EXPORT_SYMBOL_GPL(trace_vbprintk); __printf(3, 0) static int -__trace_array_vprintk(struct ring_buffer *buffer, +__trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; @@ -3326,7 +3326,7 @@ int trace_array_printk(struct trace_array *tr, EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) -int trace_array_printk_buf(struct ring_buffer *buffer, +int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; @@ -3382,7 +3382,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->array_buffer->buffer; + struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -6470,7 +6470,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; ssize_t written; @@ -6550,7 +6550,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct raw_data_entry *entry; unsigned long irq_flags; ssize_t written; @@ -7433,7 +7433,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) } struct buffer_ref { - struct ring_buffer *buffer; + struct trace_buffer *buffer; void *page; int cpu; refcount_t refcount; @@ -8272,7 +8272,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd679fe92c1f..4812a36affac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -178,7 +178,7 @@ struct trace_option_dentry; struct array_buffer { struct trace_array *tr; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct trace_array_cpu __percpu *data; u64 time_start; int cpu; @@ -705,7 +705,7 @@ struct dentry *tracing_init_dentry(void); struct ring_buffer_event; struct ring_buffer_event * -trace_buffer_lock_reserve(struct ring_buffer *buffer, +trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, @@ -717,7 +717,7 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, +void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); int trace_empty(struct trace_iterator *iter); @@ -873,7 +873,7 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk_buf(struct ring_buffer *buffer, +int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -1367,17 +1367,17 @@ struct trace_subsystem_dir { }; extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event); void trace_buffer_unlock_commit_regs(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs); static inline void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { @@ -1390,7 +1390,7 @@ void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); static inline void -__trace_event_discard_commit(struct ring_buffer *buffer, +__trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { @@ -1416,7 +1416,7 @@ __trace_event_discard_commit(struct ring_buffer *buffer, */ static inline bool __event_trigger_test_discard(struct trace_event_file *file, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, enum event_trigger_type *tt) @@ -1451,7 +1451,7 @@ __event_trigger_test_discard(struct trace_event_file *file, */ static inline void event_trigger_unlock_commit(struct trace_event_file *file, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc) { @@ -1482,7 +1482,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, */ static inline void event_trigger_unlock_commit_regs(struct trace_event_file *file, - struct ring_buffer *buffer, + struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc, struct pt_regs *regs) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d5989284a99a..eff099123aa2 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,10 +32,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; + struct trace_buffer *buffer; struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; - struct ring_buffer *buffer; unsigned long flags; int pc; const char *p; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ac557f685f0b..a16d1b601c5c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3391,8 +3391,8 @@ static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { + struct trace_buffer *buffer; struct ring_buffer_event *event; - struct ring_buffer *buffer; struct ftrace_entry *entry; unsigned long flags; long disabled; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 94c581c1a897..0454abaeb486 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -879,7 +879,7 @@ static notrace void trace_event_raw_event_synth(void *__data, struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct synth_event *event; unsigned int i, n_u64; int fields_size = 0; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79b2c2df00c5..7d71546ba00a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -101,7 +101,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -221,7 +221,7 @@ void __trace_graph_return(struct trace_array *tr, { struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index fc62a6049bd3..b44446bf0872 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -104,7 +104,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; struct trace_event_call *call = &event_hwlat; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; unsigned long flags; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f890262c8a3..477b6b011e7d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1175,8 +1175,8 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct trace_event_file *trace_file) { struct kprobe_trace_entry_head *entry; + struct trace_buffer *buffer; struct ring_buffer_event *event; - struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; struct trace_event_call *call = trace_probe_event_call(&tk->tp); @@ -1223,8 +1223,8 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct trace_event_file *trace_file) { struct kretprobe_trace_entry_head *entry; + struct trace_buffer *buffer; struct ring_buffer_event *event; - struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; struct trace_event_call *call = trace_probe_event_call(&tk->tp); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c30137148759..84582bf1ed5f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -297,7 +297,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct trace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -327,7 +327,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct trace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 510fda2fcd24..97b10bb31a1f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -378,7 +378,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct trace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -408,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->array_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bd92843c2b0e..837ad4818bb4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -317,7 +317,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long irq_flags; unsigned long args[6]; int pc; @@ -367,7 +367,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; unsigned long irq_flags; int pc; int syscall_nr; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..6c75d94f5c2f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -938,8 +938,8 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; + struct trace_buffer *buffer; struct ring_buffer_event *event; - struct ring_buffer *buffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); -- cgit v1.2.3 From d8d0c245a7fdd176e2cf6317b3fddda650059d06 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:05:18 +0900 Subject: tracing: Apply soft-disabled and filter to tracepoints printk Apply soft-disabled and the filter rule of the trace events to the printk output of tracepoints (a.k.a. tp_printk kernel parameter) as same as trace buffer output. Link: http://lkml.kernel.org/r/157867231876.17873.15825819592284704068.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4a07d7ed82a..b4294eb020f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2610,6 +2610,7 @@ static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) { struct trace_event_call *event_call; + struct trace_event_file *file; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; @@ -2623,6 +2624,12 @@ static void output_printk(struct trace_event_buffer *fbuffer) !event_call->event.funcs->trace) return; + file = fbuffer->trace_file; + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, fbuffer->entry))) + return; + event = &fbuffer->trace_file->event_call->event; spin_lock_irqsave(&tracepoint_iter_lock, flags); -- cgit v1.2.3 From 8cfcf15503f607e9597de19afeaa621897ae397e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:05:31 +0900 Subject: tracing: kprobes: Output kprobe event to printk buffer Since kprobe-events use event_trigger_unlock_commit_regs() directly, that events doesn't show up in printk buffer if "tp_printk" is set. Use trace_event_buffer_commit() in kprobe events so that it can invoke output_printk() as same as other trace events. Link: http://lkml.kernel.org/r/157867233085.17873.5210928676787339604.stgit@devnote2 Signed-off-by: Masami Hiramatsu [ Adjusted data var declaration placement in __kretprobe_trace_func() ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_events.c | 1 + kernel/trace/trace_kprobe.c | 57 ++++++++++++++++++++++----------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4294eb020f8..cb850d2c4bfa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2680,9 +2680,9 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); - event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, + event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); + fbuffer->flags, fbuffer->pc, fbuffer->regs); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a16d1b601c5c..dfb736a964d6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -272,6 +272,7 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, if (!fbuffer->event) return NULL; + fbuffer->regs = NULL; fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 477b6b011e7d..33a6a661904b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1175,35 +1175,35 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct trace_event_file *trace_file) { struct kprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; - int size, dsize, pc; - unsigned long irq_flags; struct trace_event_call *call = trace_probe_event_call(&tk->tp); + struct trace_event_buffer fbuffer; + int dsize; WARN_ON(call != trace_file->event_call); if (trace_trigger_soft_disabled(trace_file)) return; - local_save_flags(irq_flags); - pc = preempt_count(); + local_save_flags(fbuffer.flags); + fbuffer.pc = preempt_count(); + fbuffer.trace_file = trace_file; dsize = __get_data_size(&tk->tp, regs); - size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, - size, irq_flags, pc); - if (!event) + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, + fbuffer.flags, fbuffer.pc); + if (!fbuffer.event) return; - entry = ring_buffer_event_data(event); + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); - event_trigger_unlock_commit_regs(trace_file, buffer, event, - entry, irq_flags, pc, regs); + trace_event_buffer_commit(&fbuffer); } static void @@ -1223,36 +1223,35 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct trace_event_file *trace_file) { struct kretprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; - int size, pc, dsize; - unsigned long irq_flags; + struct trace_event_buffer fbuffer; struct trace_event_call *call = trace_probe_event_call(&tk->tp); + int dsize; WARN_ON(call != trace_file->event_call); if (trace_trigger_soft_disabled(trace_file)) return; - local_save_flags(irq_flags); - pc = preempt_count(); + local_save_flags(fbuffer.flags); + fbuffer.pc = preempt_count(); + fbuffer.trace_file = trace_file; dsize = __get_data_size(&tk->tp, regs); - size = sizeof(*entry) + tk->tp.size + dsize; - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, - size, irq_flags, pc); - if (!event) + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, + fbuffer.flags, fbuffer.pc); + if (!fbuffer.event) return; - entry = ring_buffer_event_data(event); + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); - event_trigger_unlock_commit_regs(trace_file, buffer, event, - entry, irq_flags, pc, regs); + trace_event_buffer_commit(&fbuffer); } static void -- cgit v1.2.3 From d8d4c6d0e79c418f8c63f3c82429b1462f196155 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:05:42 +0900 Subject: tracing: kprobes: Register to dynevent earlier stage Register kprobe event to dynevent in subsys_initcall level. This will allow kernel to register new kprobe events in fs_initcall level via trace_run_command. Link: http://lkml.kernel.org/r/157867234213.17873.18039000024374948737.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 33a6a661904b..8113d6aa7bc5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1685,11 +1685,12 @@ static __init void setup_boot_kprobe_events(void) enable_boot_kprobe_events(); } -/* Make a tracefs interface for controlling probe points */ -static __init int init_kprobe_trace(void) +/* + * Register dynevent at subsys_initcall. This allows kernel to setup kprobe + * events in fs_initcall without tracefs. + */ +static __init int init_kprobe_trace_early(void) { - struct dentry *d_tracer; - struct dentry *entry; int ret; ret = dyn_event_register(&trace_kprobe_ops); @@ -1699,6 +1700,16 @@ static __init int init_kprobe_trace(void) if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; + return 0; +} +subsys_initcall(init_kprobe_trace_early); + +/* Make a tracefs interface for controlling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) return 0; -- cgit v1.2.3 From b05e89ae7cf3bcabc52399cb833ecc9aaa51ae04 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:05:53 +0900 Subject: tracing: Accept different type for synthetic event fields Make the synthetic event accepts a different type field to record. However, the size and signed flag must be same. Link: http://lkml.kernel.org/r/157867235358.17873.61732996461602171.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0454abaeb486..4f4759c6e972 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4110,8 +4110,11 @@ static int check_synth_field(struct synth_event *event, field = event->fields[field_pos]; - if (strcmp(field->type, hist_field->type) != 0) - return -EINVAL; + if (strcmp(field->type, hist_field->type) != 0) { + if (field->size != hist_field->size || + field->is_signed != hist_field->is_signed) + return -EINVAL; + } return 0; } -- cgit v1.2.3 From 48ac9488a597eb6116472b6cc0bd875e245e252c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:06:05 +0900 Subject: tracing: Add NULL trace-array check in print_synth_event() Add NULL trace-array check in print_synth_event(), because if we enable tp_printk option, iter->tr can be NULL. Link: http://lkml.kernel.org/r/157867236536.17873.12529350542460184019.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f4759c6e972..1cb4c4c8e5b7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -833,7 +833,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, fmt = synth_field_fmt(se->fields[i]->type); /* parameter types */ - if (tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", fmt); snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); -- cgit v1.2.3 From 9c5b9d3d65e485826fb935453f01171b1a337aa8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:06:17 +0900 Subject: tracing/boot: Add boot-time tracing Setup tracing options via extra boot config in addition to kernel command line. This adds following commands support. These are applied to the global trace instance. - ftrace.options = OPT1[,OPT2...] Enable given ftrace options. - ftrace.trace_clock = CLOCK Set given CLOCK to ftrace's trace_clock. - ftrace.buffer_size = SIZE Configure ftrace buffer size to SIZE. You can use "KB" or "MB" for that SIZE. - ftrace.events = EVENT[, EVENT2...] Enable given events on boot. You can use a wild card in EVENT. - ftrace.tracer = TRACER Set TRACER to current tracer on boot. (e.g. function) Note that this is NOT replacing the kernel parameters, because this boot config based setting is later than that. If you want to trace earlier boot events, you still need kernel parameters. Link: http://lkml.kernel.org/r/157867237723.17873.17494943526320587488.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 10 ++-- kernel/trace/trace_boot.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 kernel/trace/trace_boot.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 25a0fcfa7a5d..75326d8ab1af 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -814,6 +814,15 @@ config GCOV_PROFILE_FTRACE Note that on a kernel compiled with this config, ftrace will run significantly slower. +config BOOTTIME_TRACING + bool "Boot-time Tracing support" + depends on BOOT_CONFIG && TRACING + default y + help + Enable developer to setup ftrace subsystem via supplemental + kernel cmdline at boot time for debugging (tracing) driver + initialization and boot process. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0e63db62225f..395e2db9c742 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -83,6 +83,7 @@ endif obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o +obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb850d2c4bfa..6c996d1b1687 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -162,7 +162,7 @@ union trace_eval_map_item { static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static int tracing_set_tracer(struct trace_array *tr, const char *buf); +int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc); @@ -4747,7 +4747,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return 0; } -static int trace_set_options(struct trace_array *tr, char *option) +int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; @@ -5647,8 +5647,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; } -static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, - unsigned long size, int cpu_id) +ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { int ret = size; @@ -5727,7 +5727,7 @@ static void add_tracer_options(struct trace_array *tr, struct tracer *t) create_trace_option_files(tr, t); } -static int tracing_set_tracer(struct trace_array *tr, const char *buf) +int tracing_set_tracer(struct trace_array *tr, const char *buf) { struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 000000000000..4b41310184df --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_boot.c + * Tracing kernel boot-time + */ + +#define pr_fmt(fmt) "trace_boot: " fmt + +#include +#include +#include + +#include "trace.h" + +#define MAX_BUF_LEN 256 + +extern int trace_set_options(struct trace_array *tr, char *option); +extern int tracing_set_tracer(struct trace_array *tr, const char *buf); +extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id); + +static void __init +trace_boot_set_ftrace_options(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + const char *p; + char buf[MAX_BUF_LEN]; + unsigned long v = 0; + + /* Common ftrace options */ + xbc_node_for_each_array_value(node, "options", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + pr_err("String is too long: %s\n", p); + continue; + } + + if (trace_set_options(tr, buf) < 0) + pr_err("Failed to set option: %s\n", buf); + } + + p = xbc_node_find_value(node, "trace_clock", NULL); + if (p && *p != '\0') { + if (tracing_set_clock(tr, p) < 0) + pr_err("Failed to set trace clock: %s\n", p); + } + + p = xbc_node_find_value(node, "buffer_size", NULL); + if (p && *p != '\0') { + v = memparse(p, NULL); + if (v < PAGE_SIZE) + pr_err("Buffer size is too small: %s\n", p); + if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) + pr_err("Failed to resize trace buffer to %s\n", p); + } +} + +#ifdef CONFIG_EVENT_TRACING +extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); + +static void __init +trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *p; + + xbc_node_for_each_array_value(node, "events", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + pr_err("String is too long: %s\n", p); + continue; + } + + if (ftrace_set_clr_event(tr, buf, 1) < 0) + pr_err("Failed to enable event: %s\n", p); + } +} +#else +#define trace_boot_enable_events(tr, node) do {} while (0) +#endif + +static void __init +trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) +{ + const char *p; + + p = xbc_node_find_value(node, "tracer", NULL); + if (p && *p != '\0') { + if (tracing_set_tracer(tr, p) < 0) + pr_err("Failed to set given tracer: %s\n", p); + } +} + +static int __init trace_boot_init(void) +{ + struct xbc_node *trace_node; + struct trace_array *tr; + + trace_node = xbc_find_node("ftrace"); + if (!trace_node) + return 0; + + tr = top_trace_array(); + if (!tr) + return 0; + + trace_boot_set_ftrace_options(tr, trace_node); + trace_boot_enable_events(tr, trace_node); + trace_boot_enable_tracer(tr, trace_node); + + return 0; +} + +fs_initcall(trace_boot_init); -- cgit v1.2.3 From 81a59555ff1593642824414267e1859024bd0162 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:06:29 +0900 Subject: tracing/boot: Add per-event settings Add per-event settings for boottime tracing. User can set filter, actions and enable on each event on boot. The event entries are under ftrace.event.GROUP.EVENT node (note that the option key includes event's group name and event name.) This supports below configs. - ftrace.event.GROUP.EVENT.enable Enables GROUP:EVENT tracing. - ftrace.event.GROUP.EVENT.filter = FILTER Set FILTER rule to the GROUP:EVENT. - ftrace.event.GROUP.EVENT.actions = ACTION[, ACTION2...] Set ACTIONs to the GROUP:EVENT. For example, ftrace.event.sched.sched_process_exec { filter = "pid < 128" enable } this will enable tracing "sched:sched_process_exec" event with "pid < 128" filter. Link: http://lkml.kernel.org/r/157867238942.17873.11177628789184546198.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 60 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 4b41310184df..37524031533e 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -56,6 +56,7 @@ trace_boot_set_ftrace_options(struct trace_array *tr, struct xbc_node *node) #ifdef CONFIG_EVENT_TRACING extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); +extern int trigger_process_regex(struct trace_event_file *file, char *buff); static void __init trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) @@ -74,8 +75,66 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) pr_err("Failed to enable event: %s\n", p); } } + +static void __init +trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, + struct xbc_node *enode) +{ + struct trace_event_file *file; + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *p, *group, *event; + + group = xbc_node_get_data(gnode); + event = xbc_node_get_data(enode); + + mutex_lock(&event_mutex); + file = find_event_file(tr, group, event); + if (!file) { + pr_err("Failed to find event: %s:%s\n", group, event); + goto out; + } + + p = xbc_node_find_value(enode, "filter", NULL); + if (p && *p != '\0') { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + pr_err("filter string is too long: %s\n", p); + else if (apply_event_filter(file, buf) < 0) + pr_err("Failed to apply filter: %s\n", buf); + } + + xbc_node_for_each_array_value(enode, "actions", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + pr_err("action string is too long: %s\n", p); + else if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply an action: %s\n", buf); + } + + if (xbc_node_find_value(enode, "enable", NULL)) { + if (trace_event_enable_disable(file, 1, 0) < 0) + pr_err("Failed to enable event node: %s:%s\n", + group, event); + } +out: + mutex_unlock(&event_mutex); +} + +static void __init +trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *gnode, *enode; + + node = xbc_node_find_child(node, "event"); + if (!node) + return; + /* per-event key starts with "event.GROUP.EVENT" */ + xbc_node_for_each_child(node, gnode) + xbc_node_for_each_child(gnode, enode) + trace_boot_init_one_event(tr, gnode, enode); +} #else #define trace_boot_enable_events(tr, node) do {} while (0) +#define trace_boot_init_events(tr, node) do {} while (0) #endif static void __init @@ -104,6 +163,7 @@ static int __init trace_boot_init(void) return 0; trace_boot_set_ftrace_options(tr, trace_node); + trace_boot_init_events(tr, trace_node); trace_boot_enable_events(tr, trace_node); trace_boot_enable_tracer(tr, trace_node); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2cd53ca21b51..d8ada4c6f3f7 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return ret; } -static int trigger_process_regex(struct trace_event_file *file, char *buff) +int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next = buff; struct event_command *p; -- cgit v1.2.3 From 4d655281eb1bb59fad021c0f68afd033f8d0320d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:06:41 +0900 Subject: tracing/boot Add kprobe event support Add kprobe event support on event node to boot-time tracing. If the group name of event is "kprobes", the boot-time tracing defines new probe event according to "probes" values. - ftrace.event.kprobes.EVENT.probes = PROBE[, PROBE2...] Defines new kprobe event based on PROBEs. It is able to define multiple probes on one event, but those must have same type of arguments. For example, ftrace.events.kprobes.myevent { probes = "vfs_read $arg1 $arg2"; enable; } This will add kprobes:myevent on vfs_read with the 1st and the 2nd arguments. Link: http://lkml.kernel.org/r/157867240104.17873.9712052065426433111.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 5 +++++ 2 files changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 37524031533e..a11dc60299fb 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -76,6 +76,48 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) } } +#ifdef CONFIG_KPROBE_EVENTS +extern int trace_kprobe_run_command(const char *command); + +static int __init +trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) +{ + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *val; + char *p; + int len; + + len = snprintf(buf, ARRAY_SIZE(buf) - 1, "p:kprobes/%s ", event); + if (len >= ARRAY_SIZE(buf)) { + pr_err("Event name is too long: %s\n", event); + return -E2BIG; + } + p = buf + len; + len = ARRAY_SIZE(buf) - len; + + xbc_node_for_each_array_value(node, "probes", anode, val) { + if (strlcpy(p, val, len) >= len) { + pr_err("Probe definition is too long: %s\n", val); + return -E2BIG; + } + if (trace_kprobe_run_command(buf) < 0) { + pr_err("Failed to add probe: %s\n", buf); + return -EINVAL; + } + } + + return 0; +} +#else +static inline int __init +trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) +{ + pr_err("Kprobe event is not supported.\n"); + return -ENOTSUPP; +} +#endif + static void __init trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, struct xbc_node *enode) @@ -88,6 +130,10 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, group = xbc_node_get_data(gnode); event = xbc_node_get_data(enode); + if (!strcmp(group, "kprobes")) + if (trace_boot_add_kprobe_event(enode, event) < 0) + return; + mutex_lock(&event_mutex); file = find_event_file(tr, group, event); if (!file) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8113d6aa7bc5..283b7c437440 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -902,6 +902,11 @@ static int create_or_delete_trace_kprobe(int argc, char **argv) return ret == -ECANCELED ? -EINVAL : ret; } +int trace_kprobe_run_command(const char *command) +{ + return trace_run_command(command, create_or_delete_trace_kprobe); +} + static int trace_kprobe_release(struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); -- cgit v1.2.3 From 3fbe2d6e1fce255d918b622fb2af22e98364a154 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:06:52 +0900 Subject: tracing/boot: Add synthetic event support Add synthetic event node support to boot time tracing. The synthetic event is a kind of event node, but the group name is "synthetic". - ftrace.event.synthetic.EVENT.fields = FIELD[, FIELD2...] Defines new synthetic event with FIELDs. Each field should be "type varname". The synthetic node requires "fields" string arraies, which defines the fields as same as tracing/synth_events interface. Link: http://lkml.kernel.org/r/157867241236.17873.12411615143321557709.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 47 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_hist.c | 5 +++++ 2 files changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a11dc60299fb..3054921b0877 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -118,6 +118,50 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) } #endif +#ifdef CONFIG_HIST_TRIGGERS +extern int synth_event_run_command(const char *command); + +static int __init +trace_boot_add_synth_event(struct xbc_node *node, const char *event) +{ + struct xbc_node *anode; + char buf[MAX_BUF_LEN], *q; + const char *p; + int len, delta, ret; + + len = ARRAY_SIZE(buf); + delta = snprintf(buf, len, "%s", event); + if (delta >= len) { + pr_err("Event name is too long: %s\n", event); + return -E2BIG; + } + len -= delta; q = buf + delta; + + xbc_node_for_each_array_value(node, "fields", anode, p) { + delta = snprintf(q, len, " %s;", p); + if (delta >= len) { + pr_err("fields string is too long: %s\n", p); + return -E2BIG; + } + len -= delta; q += delta; + } + + ret = synth_event_run_command(buf); + if (ret < 0) + pr_err("Failed to add synthetic event: %s\n", buf); + + + return ret; +} +#else +static inline int __init +trace_boot_add_synth_event(struct xbc_node *node, const char *event) +{ + pr_err("Synthetic event is not supported.\n"); + return -ENOTSUPP; +} +#endif + static void __init trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, struct xbc_node *enode) @@ -133,6 +177,9 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (!strcmp(group, "kprobes")) if (trace_boot_add_kprobe_event(enode, event) < 0) return; + if (!strcmp(group, "synthetic")) + if (trace_boot_add_synth_event(enode, event) < 0) + return; mutex_lock(&event_mutex); file = find_event_file(tr, group, event); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1cb4c4c8e5b7..8e90f1ada437 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1384,6 +1384,11 @@ static int create_or_delete_synth_event(int argc, char **argv) return ret == -ECANCELED ? -EINVAL : ret; } +int synth_event_run_command(const char *command) +{ + return trace_run_command(command, create_or_delete_synth_event); +} + static int synth_event_create(int argc, const char **argv) { const char *name = argv[0]; -- cgit v1.2.3 From 4f712a4d04a4e49167118b41d8ea96df70a98985 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:07:04 +0900 Subject: tracing/boot: Add instance node support Add instance node support to boot-time tracing. User can set some options and event nodes under instance node. - ftrace.instance.INSTANCE[...] Add new INSTANCE instance. Some options and event nodes are acceptable for instance node. Link: http://lkml.kernel.org/r/157867242413.17873.9814204526141500278.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3054921b0877..f5db30d25b0b 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -20,7 +20,7 @@ extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id); static void __init -trace_boot_set_ftrace_options(struct trace_array *tr, struct xbc_node *node) +trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) { struct xbc_node *anode; const char *p; @@ -242,6 +242,40 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) } } +static void __init +trace_boot_init_one_instance(struct trace_array *tr, struct xbc_node *node) +{ + trace_boot_set_instance_options(tr, node); + trace_boot_init_events(tr, node); + trace_boot_enable_events(tr, node); + trace_boot_enable_tracer(tr, node); +} + +static void __init +trace_boot_init_instances(struct xbc_node *node) +{ + struct xbc_node *inode; + struct trace_array *tr; + const char *p; + + node = xbc_node_find_child(node, "instance"); + if (!node) + return; + + xbc_node_for_each_child(node, inode) { + p = xbc_node_get_data(inode); + if (!p || *p == '\0') + continue; + + tr = trace_array_get_by_name(p); + if (IS_ERR(tr)) { + pr_err("Failed to get trace instance %s\n", p); + continue; + } + trace_boot_init_one_instance(tr, inode); + } +} + static int __init trace_boot_init(void) { struct xbc_node *trace_node; @@ -255,10 +289,9 @@ static int __init trace_boot_init(void) if (!tr) return 0; - trace_boot_set_ftrace_options(tr, trace_node); - trace_boot_init_events(tr, trace_node); - trace_boot_enable_events(tr, trace_node); - trace_boot_enable_tracer(tr, trace_node); + /* Global trace array is also one instance */ + trace_boot_init_one_instance(tr, trace_node); + trace_boot_init_instances(trace_node); return 0; } -- cgit v1.2.3 From 9d15dbbde1048e490f3773e67d7ef7604bab1409 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:07:16 +0900 Subject: tracing/boot: Add cpu_mask option support Add ftrace.cpumask option support to boot-time tracing. This sets cpumask for each instance. - ftrace.[instance.INSTANCE.]cpumask = CPUMASK; Set the trace cpumask. Note that the CPUMASK should be a string which /tracing_cpumask can accepts. Link: http://lkml.kernel.org/r/157867243625.17873.13613922641273149372.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 42 +++++++++++++++++++++++++++++------------- kernel/trace/trace_boot.c | 14 ++++++++++++++ 2 files changed, 43 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c996d1b1687..106bbc0988fe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4561,20 +4561,13 @@ out_err: return count; } -static ssize_t -tracing_cpumask_write(struct file *filp, const char __user *ubuf, - size_t count, loff_t *ppos) +int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new) { - struct trace_array *tr = file_inode(filp)->i_private; - cpumask_var_t tracing_cpumask_new; - int err, cpu; - - if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) - return -ENOMEM; + int cpu; - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); - if (err) - goto err_unlock; + if (!tr) + return -EINVAL; local_irq_disable(); arch_spin_lock(&tr->max_lock); @@ -4598,11 +4591,34 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); + + return 0; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + cpumask_var_t tracing_cpumask_new; + int err; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_free; + + err = tracing_set_cpumask(tr, tracing_cpumask_new); + if (err) + goto err_free; + free_cpumask_var(tracing_cpumask_new); return count; -err_unlock: +err_free: free_cpumask_var(tracing_cpumask_new); return err; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index f5db30d25b0b..81d923c16a4d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -18,6 +18,8 @@ extern int trace_set_options(struct trace_array *tr, char *option); extern int tracing_set_tracer(struct trace_array *tr, const char *buf); extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id); +extern int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new); static void __init trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) @@ -52,6 +54,18 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) pr_err("Failed to resize trace buffer to %s\n", p); } + + p = xbc_node_find_value(node, "cpumask", NULL); + if (p && *p != '\0') { + cpumask_var_t new_mask; + + if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + if (cpumask_parse(p, new_mask) < 0 || + tracing_set_cpumask(tr, new_mask) < 0) + pr_err("Failed to set new CPU mask %s\n", p); + free_cpumask_var(new_mask); + } + } } #ifdef CONFIG_EVENT_TRACING -- cgit v1.2.3 From fe1efe9252f938f0cc624e4ac817c27bcaef6ed0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:07:28 +0900 Subject: tracing/boot: Add function tracer filter options Add below function-tracer filter options to boot-time tracing. - ftrace.[instance.INSTANCE.]ftrace.filters This will take an array of tracing function filter rules - ftrace.[instance.INSTANCE.]ftrace.notraces This will take an array of NON-tracing function filter rules Link: http://lkml.kernel.org/r/157867244841.17873.10933616628243103561.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 81d923c16a4d..fa9603dc6469 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -244,11 +244,51 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) #define trace_boot_init_events(tr, node) do {} while (0) #endif +#ifdef CONFIG_DYNAMIC_FTRACE +extern bool ftrace_filter_param __initdata; +extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +static void __init +trace_boot_set_ftrace_filter(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + const char *p; + char *q; + + xbc_node_for_each_array_value(node, "ftrace.filters", anode, p) { + q = kstrdup(p, GFP_KERNEL); + if (!q) + return; + if (ftrace_set_filter(tr->ops, q, strlen(q), 0) < 0) + pr_err("Failed to add %s to ftrace filter\n", p); + else + ftrace_filter_param = true; + kfree(q); + } + xbc_node_for_each_array_value(node, "ftrace.notraces", anode, p) { + q = kstrdup(p, GFP_KERNEL); + if (!q) + return; + if (ftrace_set_notrace(tr->ops, q, strlen(q), 0) < 0) + pr_err("Failed to add %s to ftrace filter\n", p); + else + ftrace_filter_param = true; + kfree(q); + } +} +#else +#define trace_boot_set_ftrace_filter(tr, node) do {} while (0) +#endif + static void __init trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) { const char *p; + trace_boot_set_ftrace_filter(tr, node); + p = xbc_node_find_value(node, "tracer", NULL); if (p && *p != '\0') { if (tracing_set_tracer(tr, p) < 0) -- cgit v1.2.3 From 8649c322f75c96e7ced2fec201e123b2b073bf09 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 7 Jan 2020 09:59:25 -0800 Subject: pid: Implement pidfd_getfd syscall This syscall allows for the retrieval of file descriptors from other processes, based on their pidfd. This is possible using ptrace, and injection of parasitic code to inject code which leverages SCM_RIGHTS to move file descriptors between a tracee and a tracer. Unfortunately, ptrace comes with a high cost of requiring the process to be stopped, and breaks debuggers. This does not require stopping the process under manipulation. One reason to use this is to allow sandboxers to take actions on file descriptors on the behalf of another process. For example, this can be combined with seccomp-bpf's user notification to do on-demand fd extraction and take privileged actions. One such privileged action is binding a socket to a privileged port. /* prototype */ /* flags is currently reserved and should be set to 0 */ int sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); /* testing */ Ran self-test suite on x86_64 Signed-off-by: Sargun Dhillon Acked-by: Christian Brauner Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200107175927.4558-3-sargun@sargun.me Signed-off-by: Christian Brauner --- kernel/pid.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 2278e249141d..0f4ecb57214c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -578,3 +578,93 @@ void __init pid_idr_init(void) init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } + +static struct file *__pidfd_fget(struct task_struct *task, int fd) +{ + struct file *file; + int ret; + + ret = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (ret) + return ERR_PTR(ret); + + if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) + file = fget_task(task, fd); + else + file = ERR_PTR(-EPERM); + + mutex_unlock(&task->signal->cred_guard_mutex); + + return file ?: ERR_PTR(-EBADF); +} + +static int pidfd_getfd(struct pid *pid, int fd) +{ + struct task_struct *task; + struct file *file; + int ret; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + file = __pidfd_fget(task, fd); + put_task_struct(task); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = security_file_receive(file); + if (ret) { + fput(file); + return ret; + } + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + fput(file); + else + fd_install(ret, file); + + return ret; +} + +/** + * sys_pidfd_getfd() - Get a file descriptor from another process + * + * @pidfd: the pidfd file descriptor of the process + * @fd: the file descriptor number to get + * @flags: flags on how to get the fd (reserved) + * + * This syscall gets a copy of a file descriptor from another process + * based on the pidfd, and file descriptor number. It requires that + * the calling process has the ability to ptrace the process represented + * by the pidfd. The process which is having its file descriptor copied + * is otherwise unaffected. + * + * Return: On success, a cloexec file descriptor is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, + unsigned int, flags) +{ + struct pid *pid; + struct fd f; + int ret; + + /* flags is currently unused - make sure it's unset */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + pid = pidfd_pid(f.file); + if (IS_ERR(pid)) + ret = PTR_ERR(pid); + else + ret = pidfd_getfd(pid, fd); + + fdput(f); + return ret; +} -- cgit v1.2.3 From 3b42a4c83a31d8f1d8a7cb7eb2f4ee809d42c69d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Dec 2019 11:31:43 +0900 Subject: tracing: trigger: Replace unneeded RCU-list traversals With CONFIG_PROVE_RCU_LIST, I had many suspicious RCU warnings when I ran ftracetest trigger testcases. ----- # dmesg -c > /dev/null # ./ftracetest test.d/trigger ... # dmesg | grep "RCU-list traversed" | cut -f 2 -d ] | cut -f 2 -d " " kernel/trace/trace_events_hist.c:6070 kernel/trace/trace_events_hist.c:1760 kernel/trace/trace_events_hist.c:5911 kernel/trace/trace_events_trigger.c:504 kernel/trace/trace_events_hist.c:1810 kernel/trace/trace_events_hist.c:3158 kernel/trace/trace_events_hist.c:3105 kernel/trace/trace_events_hist.c:5518 kernel/trace/trace_events_hist.c:5998 kernel/trace/trace_events_hist.c:6019 kernel/trace/trace_events_hist.c:6044 kernel/trace/trace_events_trigger.c:1500 kernel/trace/trace_events_trigger.c:1540 kernel/trace/trace_events_trigger.c:539 kernel/trace/trace_events_trigger.c:584 ----- I investigated those warnings and found that the RCU-list traversals in event trigger and hist didn't need to use RCU version because those were called only under event_mutex. I also checked other RCU-list traversals related to event trigger list, and found that most of them were called from event_hist_trigger_func() or hist_unregister_trigger() or register/unregister functions except for a few cases. Replace these unneeded RCU-list traversals with normal list traversal macro and lockdep_assert_held() to check the event_mutex is held. Link: http://lkml.kernel.org/r/157680910305.11685.15110237954275915782.stgit@devnote2 Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++---------- kernel/trace/trace_events_trigger.c | 20 +++++++++++++----- 2 files changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e90f1ada437..117a1202a6b9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1771,11 +1771,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, struct event_trigger_data *test; struct hist_field *hist_field; + lockdep_assert_held(&event_mutex); + hist_field = find_var_field(hist_data, var_name); if (hist_field) return hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -1825,7 +1827,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file, struct event_trigger_data *test; struct hist_field *hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -3120,7 +3124,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, { struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (test->private_data == hist_data) return test->filter_str; @@ -3171,9 +3177,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data, struct event_trigger_data *test; unsigned int n_keys; + lockdep_assert_held(&event_mutex); + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; @@ -5536,7 +5544,7 @@ static int hist_show(struct seq_file *m, void *v) goto out_unlock; } - list_for_each_entry_rcu(data, &event_file->triggers, list) { + list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } @@ -5929,7 +5937,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->attrs->name && !named_data) goto new; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6013,10 +6023,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data, struct event_trigger_data *test, *named_data = NULL; bool match = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (hist_trigger_match(data, test, named_data, false)) { match = true; @@ -6034,10 +6046,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6059,10 +6073,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6088,7 +6104,9 @@ static bool hist_file_check_refs(struct trace_event_file *file) struct hist_trigger_data *hist_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; if (check_var_refs(hist_data)) @@ -6331,7 +6349,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + list_for_each_entry_rcu(test, &enable_data->file->triggers, list, + lockdep_is_held(&event_mutex)) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (enable_data->enable) test->paused = false; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d8ada4c6f3f7..60959c31791d 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file) struct event_trigger_data *data; bool set_cond = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->filter || event_command_post_trigger(data->cmd_ops) || event_command_needs_rec(data->cmd_ops)) { set_cond = true; @@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { ret = -EEXIST; goto out; @@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); @@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && (test->cmd_ops->trigger_type == @@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && (data->cmd_ops->trigger_type == -- cgit v1.2.3 From 769071ac9f20b6a447410c7eaa55d1a5233ef40c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:52 +0000 Subject: ns: Introduce Time Namespace Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to changes date and time in a container (CLOCK_REALTIME). Providing per namespace notions of CLOCK_REALTIME would be complex with a massive overhead, but has a dubious value. But in the context of checkpoint/restore functionality, monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, it has to be guaranteed that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that it is required to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be achieved by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in the way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that it can be used only with the unshare() and the clone3() system calls. [ tglx: Adjusted paragraph about clone3() to reality and massaged the changelog a bit. ] Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Link: https://lore.kernel.org/r/20191112012724.250792-4-dima@arista.com --- kernel/fork.c | 16 +++- kernel/nsproxy.c | 41 +++++++-- kernel/time/Makefile | 1 + kernel/time/namespace.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 265 insertions(+), 10 deletions(-) create mode 100644 kernel/time/namespace.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..363595815144 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process( struct multiprocess_signals delayed; struct file *pidfile = NULL; u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different @@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } + + /* + * If the new process will be in a different time namespace + * do not allow it to share VM or a thread group with the forking task. + */ + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } @@ -2811,7 +2820,8 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| + CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c815f58e6bc0..ed9882108cd2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_CGROUPS .cgroup_ns = &init_cgroup_ns, #endif +#ifdef CONFIG_TIME_NS + .time_ns = &init_time_ns, + .time_ns_for_children = &init_time_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_net; } + new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns, + tsk->nsproxy->time_ns_for_children); + if (IS_ERR(new_nsp->time_ns_for_children)) { + err = PTR_ERR(new_nsp->time_ns_for_children); + goto out_time; + } + new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns); + return new_nsp; +out_time: + put_net(new_nsp->net_ns); out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; + int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP)))) { - get_nsproxy(old_ns); - return 0; - } - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + get_nsproxy(old_ns); + return 0; + } + } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); + ret = timens_on_fork(new_ns, tsk); + if (ret) { + free_nsproxy(new_ns); + return ret; + } + tsk->nsproxy = new_ns; return 0; } @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + if (ns->time_ns) + put_time_ns(ns->time_ns); + if (ns->time_ns_for_children) + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..2662a69e0382 --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin + * Author: Dmitry Safonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); -- cgit v1.2.3 From af993f58d69ee9c1f421dfc87c3ed231c113989c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:53 +0000 Subject: time: Add timens_offsets to be used for tasks in time namespace Introduce offsets for time namespace. They will contain an adjustment needed to convert clocks to/from host's. A new namespace is created with the same offsets as the time namespace of the current process. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-5-dima@arista.com --- kernel/time/namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 2662a69e0382..c2a58e45fc4b 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -60,6 +61,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; return ns; fail_free: -- cgit v1.2.3 From 819a95fe3adfc7b558bfd96dd5ac589c4f543fd4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:54 +0000 Subject: posix-clocks: Rename the clock_get() callback to clock_get_timespec() The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format, rather than in (struct timespec). Rename the clock_get() callback to clock_get_timespec() as a preparation for introducing clock_get_ktime(). Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-6-dima@arista.com --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-clock.c | 8 ++++---- kernel/time/posix-cpu-timers.c | 32 ++++++++++++++++---------------- kernel/time/posix-timers.c | 22 +++++++++++----------- kernel/time/posix-timers.h | 4 ++-- 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 451f9d05ccfe..8523df726fee 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,7 +657,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get interface + * alarm_clock_get - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, + .clock_get_timespec = alarm_clock_get, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 200fb2d3be99..77c0c2370b6d 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -310,8 +310,8 @@ out: } const struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 42d512fcfda2..8ff6da77a01f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer) } const struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, - .timer_rearm = posix_cpu_timer_rearm, + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; const struct k_clock clock_thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0ec5b7a1d769..44d4f9cb782d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -667,7 +667,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet another callback. */ - kc->clock_get(timr->it_clock, &ts64); + kc->clock_get_timespec(timr->it_clock, &ts64); now = timespec64_to_ktime(ts64); /* @@ -781,7 +781,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get() work. + * functions which use timr->kclock->clock_get_timespec() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -1067,7 +1067,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; @@ -1149,7 +1149,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, if (!kc) return -EINVAL; - err = kc->clock_get(which_clock, &ts); + err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; @@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, + .clock_get_timespec = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, + .clock_get_timespec = posix_ktime_get_ts, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1295,22 +1295,22 @@ static const struct k_clock clock_monotonic = { static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, + .clock_get_timespec = posix_get_tai, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, + .clock_get_timespec = posix_get_boottime, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 897c29e162b9..070611b2c253 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -6,8 +6,8 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); - int (*clock_get)(const clockid_t which_clock, - struct timespec64 *tp); + int (*clock_get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, -- cgit v1.2.3 From eaf80194d0fe48be393587541c48a799a9a06a70 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:55 +0000 Subject: posix-clocks: Rename .clock_get_timespec() callbacks accordingly The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format in (struct k_clock). As a preparation ground for introducing clock_get_ktime(), the original callback clock_get() was renamed into clock_get_timespec(). Reflect the renaming into the callback implementations. Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-7-dima@arista.com --- kernel/time/alarmtimer.c | 6 +++--- kernel/time/posix-timers.c | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8523df726fee..62b06cfa710d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,13 +657,13 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get_timespec interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get_timespec = alarm_clock_get, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 44d4f9cb782d..68d4690cc225 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; @@ -187,7 +187,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); return 0; @@ -222,13 +222,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; @@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_ktime_get_ts, + .clock_get_timespec = posix_get_monotonic_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1310,7 +1310,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_tai, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_boottime, + .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, -- cgit v1.2.3 From 41b3b8dffc1f84e581addfbc09bec0289db3315e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:56 +0000 Subject: alarmtimer: Rename gettime() callback to get_ktime() The upcoming support for time namespaces requires to have access to: - The time in a tasks time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() struct alarm_base needs to follow the same naming convention, so rename .gettime() callback into get_ktime() as a preparation for introducing get_timespec(). Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-8-dima@arista.com --- kernel/time/alarmtimer.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 62b06cfa710d..22b6f9b133b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -36,13 +36,13 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @gettime: Function to read the time correlating to the base + * @get_ktime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - ktime_t (*gettime)(void); + ktime_t (*get_ktime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - restart = alarm->function(alarm, base->gettime()); + restart = alarm->function(alarm, base->get_ktime()); spin_lock_irqsave(&base->lock, flags); if (restart != ALARMTIMER_NORESTART) { @@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_fired(alarm, base->gettime()); + trace_alarmtimer_fired(alarm, base->get_ktime()); return ret; } @@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - return ktime_sub(alarm->node.expires, base->gettime()); + return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); @@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev) spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; - delta = ktime_sub(next->expires, base->gettime()); + delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; @@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start) hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_start(alarm, base->gettime()); + trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add_safe(start, base->gettime()); + start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_cancel(alarm, base->gettime()); + trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - return alarm_forward(alarm, base->gettime(), interval); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) return; } - delta = ktime_sub(absexp, base->gettime()); + delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { @@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, struct alarm_base *base = &alarm_bases[alarm->type]; if (!absolute) - expires = ktime_add_safe(expires, base->gettime()); + expires = ktime_add_safe(expires, base->get_ktime()); if (sigev_none) alarm->node.expires = expires; else @@ -670,7 +670,7 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->gettime()); + *tp = ktime_to_timespec64(base->get_ktime()); return 0; } @@ -747,7 +747,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, struct timespec64 rmt; ktime_t rem; - rem = ktime_sub(absexp, alarm_bases[type].gettime()); + rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); if (rem <= 0) return 0; @@ -816,7 +816,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { - ktime_t now = alarm_bases[type].gettime(); + ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); } @@ -882,9 +882,9 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; - alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; - alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); -- cgit v1.2.3 From 2f58bf909abf9670fa4e848b433dc12ba4c2a44e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:57 +0000 Subject: alarmtimer: Provide get_timespec() callback The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() Wire up alarm bases with get_timespec(). Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-9-dima@arista.com --- kernel/time/alarmtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 22b6f9b133b2..357be1fe6e1f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,12 +37,14 @@ * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; ktime_t (*get_ktime)(void); + void (*get_timespec)(struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -670,7 +672,8 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->get_ktime()); + base->get_timespec(tp); + return 0; } @@ -883,8 +886,10 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = ktime_get_boottime_ts64; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); -- cgit v1.2.3 From 9c71a2e8a757bc6aee256bc97c6fb711144b0a0f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:58 +0000 Subject: posix-clocks: Introduce clock_get_ktime() callback The callsite in common_timer_get() has already a comment: /* * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet another callback. */ kc->clock_get(timr->it_clock, &ts64); now = timespec64_to_ktime(ts64); The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format. Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-10-dima@arista.com --- kernel/time/alarmtimer.c | 19 ++++++++++++++++++- kernel/time/posix-timers.c | 26 +++++++++++++++++++++++++- kernel/time/posix-timers.h | 3 +++ 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 357be1fe6e1f..4d8c90546635 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -663,7 +663,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp * @which_clock: clockid * @tp: timespec to fill. * - * Provides the underlying alarm base time. + * Provides the underlying alarm base time in a tasks time namespace. */ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { @@ -677,6 +677,22 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp return 0; } +/** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace. + */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + /** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage @@ -840,6 +856,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, + .clock_get_ktime = alarm_clock_get_ktime, .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 68d4690cc225..a1f6b968c5d8 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -171,6 +171,11 @@ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 return 0; } +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) @@ -193,6 +198,11 @@ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 return 0; } +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + /* * Get monotonic-raw time for posix timers */ @@ -228,12 +238,22 @@ static int posix_get_boottime_timespec(const clockid_t which_clock, struct times return 0; } +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -781,7 +801,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get_timespec() work. + * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -1262,6 +1282,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1280,6 +1301,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, + .clock_get_ktime = posix_get_monotonic_ktime, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1310,6 +1332,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_tai_ktime, .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, @@ -1326,6 +1349,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 070611b2c253..f32a2ebba9b8 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -6,8 +6,11 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); + /* Returns the clock value in the current time namespace. */ int (*clock_get_timespec)(const clockid_t which_clock, struct timespec64 *tp); + /* Returns the clock value in the root time namespace. */ + ktime_t (*clock_get_ktime)(const clockid_t which_clock); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, -- cgit v1.2.3 From 198fa445d5c4c1a1c6c1d39f962559f8d008e79d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:59 +0000 Subject: posix-timers: Use clock_get_ktime() in common_timer_get() Now, when the clock_get_ktime() callback exists, the suboptimal timespec64-based conversion can be removed from common_timer_get(). Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-11-dima@arista.com --- kernel/time/posix-timers.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a1f6b968c5d8..fe1de4f71ace 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -665,7 +665,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -683,12 +682,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* - * The timespec64 based conversion is suboptimal, but it's not - * worth to implement yet another callback. - */ - kc->clock_get_timespec(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the -- cgit v1.2.3 From 5a590f35add93c2bdf3ed83eee73111021679562 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:00 +0000 Subject: posix-clocks: Wire up clock_gettime() with timens offsets Adjust monotonic and boottime clocks with per-timens offsets. As the result a process inside time namespace will see timers and clocks corrected to offsets that were set when the namespace was created Note that applications usually go through vDSO to get time, which is not yet adjusted. Further changes will complete time namespace virtualisation with vDSO support. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-12-dima@arista.com --- kernel/time/alarmtimer.c | 9 ++++++++- kernel/time/posix-stubs.c | 3 +++ kernel/time/posix-timers.c | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4d8c90546635..9a8e81bc4ec2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -886,6 +887,12 @@ static struct platform_driver alarmtimer_driver = { } }; +static void get_boottime_timespec(struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); +} + /** * alarmtimer_init - Initialize alarm timer code * @@ -906,7 +913,7 @@ static int __init alarmtimer_init(void) alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; - alarm_bases[ALARM_BOOTTIME].get_timespec = ktime_get_boottime_ts64; + alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 20c65a7d4e3a..bcbaa2045f5e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fe1de4f71ace..d26b915b227a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -195,6 +196,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -209,6 +211,7 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -223,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -235,6 +239,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -- cgit v1.2.3 From 89dd8eecfe961fab4924dcd14f80cf2ab2820044 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:01 +0000 Subject: time: Add do_timens_ktime_to_host() helper The helper subtracts namespace's clock offset from the given time and ensures that the result is within [0, KTIME_MAX]. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-13-dima@arista.com --- kernel/time/namespace.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index c2a58e45fc4b..1a0fbaa5d2d4 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,42 @@ #include #include +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); -- cgit v1.2.3 From 7da8b3a44bb426a43670b3a97ed61085018a9d43 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:03 +0000 Subject: posix-timers: Make timer_settime() time namespace aware Wire timer_settime() syscall into time namespace virtualization. sys_timer_settime() calls the ktime->timer_set() callback. Right now, common_timer_set() is the only implementation for the callback. The user-supplied expiry value is converted from timespec64 to ktime and then timens_ktime_to_host() can be used to convert namespace's time to the host time. Inside a time namespace kernel's time differs by a fixed offset from a user-supplied time, but only absolute values (TIMER_ABSTIME) must be converted. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-15-dima@arista.com --- kernel/time/posix-timers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d26b915b227a..473082b0b57f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -885,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); -- cgit v1.2.3 From 0b9b9a3b162e85e620e3598f1badc45b8a177492 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:04 +0000 Subject: alarmtimer: Make nanosleep() time namespace aware clock_nanosleep() accepts absolute values of expiration time when the TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace and has to be converted to the host's time. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-16-dima@arista.com --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9a8e81bc4ec2..b51b36e533c4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -839,6 +839,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); -- cgit v1.2.3 From ea2d1f7fce0f18b67f915c00c6a7a6860116bc92 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:05 +0000 Subject: hrtimers: Prepare hrtimer_nanosleep() for time namespaces clock_nanosleep() accepts absolute values of expiration time when TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace, and has to be converted to the host's time. There is timens_ktime_to_host() helper for converting time, but it accepts ktime argument. As a preparation, make hrtimer_nanosleep() accept a clock value in ktime instead of timespec64. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-17-dima@arista.com --- kernel/time/hrtimer.c | 12 +++++++----- kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 +++- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8de90ea31280..d8b62f93fc8d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index bcbaa2045f5e..5745a138f254 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,7 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 473082b0b57f..75fee6e39e5a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1221,7 +1221,9 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } -- cgit v1.2.3 From 1f9b37bfbb607a09d838c248843e63a2cafe1080 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:06 +0000 Subject: posix-timers: Make clock_nanosleep() time namespace aware clock_nanosleep() accepts absolute values of expiration time, if the TIMER_ABSTIME flag is set. This value is in the tasks time namespace, which has to be converted to the host time namespace. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-18-dima@arista.com --- kernel/time/posix-stubs.c | 12 ++++++++++-- kernel/time/posix-timers.c | 17 +++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 5745a138f254..fcb3b21d8bdc 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -129,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct __kernel_timespec __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -147,7 +148,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -218,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -236,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75fee6e39e5a..ff0eb30de346 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1228,6 +1228,19 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +static int common_nsleep_timens(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) @@ -1305,7 +1318,7 @@ static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime = posix_get_monotonic_ktime, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1354,7 +1367,7 @@ static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, -- cgit v1.2.3 From afaa7b5ac7c87479fb5a626f87d2157af30d6401 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:12 +0000 Subject: time: Allocate per-timens vvar page VDSO support for Time namespace needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page contains time namespace clock offsets and it has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. Allocate the timens page during namespace creation. Setup the offsets when the first task enters the ns and freeze them to guarantee the pace of monotonic/boottime clocks and to avoid breakage of applications. The design decision is to have a global offset_lock which is used during namespace offsets setup and to freeze offsets when the first task joins the new time namespace. That is better in terms of memory usage compared to having a per namespace mutex that's used only during the setup period. Suggested-by: Andy Lutomirski Based-on-work-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-24-dima@arista.com --- kernel/time/namespace.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 1a0fbaa5d2d4..d705c15d0273 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,8 @@ #include #include +#include + ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { @@ -90,16 +92,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, kref_init(&ns->kref); + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + err = ns_alloc_inum(&ns->ns); if (err) - goto fail_free; + goto fail_free_page; ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; return ns; +fail_free_page: + __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: @@ -128,6 +137,93 @@ struct time_namespace *copy_time_ns(unsigned long flags, return clone_time_ns(user_ns, old_ns); } +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + void free_time_ns(struct kref *kref) { struct time_namespace *ns; @@ -136,6 +232,7 @@ void free_time_ns(struct kref *kref) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); kfree(ns); } @@ -192,6 +289,8 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + timens_set_vvar_page(current, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -211,6 +310,8 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; + timens_set_vvar_page(tsk, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -246,6 +347,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, + .frozen_offsets = true, }; static int __init time_ns_init(void) -- cgit v1.2.3 From 70ddf65184ec1e8989322f35193e4fde7377f0cc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:15 +0000 Subject: x86/vdso: Zap vvar pages when switching to a time namespace The VVAR page layout depends on whether a task belongs to the root or non-root time namespace. Whenever a task changes its namespace, the VVAR page tables are cleared and then they will be re-faulted with a corresponding layout. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-27-dima@arista.com --- kernel/time/namespace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d705c15d0273..0732964803b9 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -281,6 +281,7 @@ static void timens_put(struct ns_common *ns) static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) { struct time_namespace *ns = to_time_ns(new); + int err; if (!current_is_single_threaded()) return -EUSERS; @@ -291,6 +292,10 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) timens_set_vvar_page(current, ns); + err = vdso_join_timens(current, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -305,6 +310,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); + int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -312,6 +318,10 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); + err = vdso_join_timens(tsk, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- cgit v1.2.3 From 04a8682a71becdb639ec9c0d82b315a2baef7a5d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:16 +0000 Subject: fs/proc: Introduce /proc/pid/timens_offsets API to set time namespace offsets for children processes, i.e.: echo "$clockid $offset_sec $offset_nsec" > /proc/self/timens_offsets Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-28-dima@arista.com --- kernel/time/namespace.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0732964803b9..12858507d75a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,106 @@ static struct user_namespace *timens_owner(struct ns_common *ns) return to_time_ns(ns)->user_ns; } +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + const struct proc_ns_operations timens_operations = { .name = "time", .type = CLONE_NEWTIME, -- cgit v1.2.3 From 984cfe4e252681d516df056b982e3c47b66fba92 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 18 Dec 2019 13:40:35 -0400 Subject: mm/mmu_notifier: Rename struct mmu_notifier_mm to mmu_notifier_subscriptions The name mmu_notifier_mm implies that the thing is a mm_struct pointer, and is difficult to abbreviate. The struct is actually holding the interval tree and hlist containing the notifiers subscribed to a mm. Use 'subscriptions' as the variable name for this struct instead of the really terrible and misleading 'mmn_mm'. Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..047865086cdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -692,7 +692,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - mmu_notifier_mm_destroy(mm); + mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); free_mm(mm); @@ -1025,7 +1025,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); - mmu_notifier_mm_init(mm); + mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; -- cgit v1.2.3 From 8379bb84be757d5df2d818509faec5d66adb861d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 16:06:14 +0000 Subject: keys: Fix request_key() cache When the key cached by request_key() and co. is cleaned up on exit(), the code looks in the wrong task_struct, and so clears the wrong cache. This leads to anomalies in key refcounting when doing, say, a kernel build on an afs volume, that then trigger kasan to report a use-after-free when the key is viewed in /proc/keys. Fix this by making exit_creds() look in the passed-in task_struct rather than in current (the task_struct cleanup code is deferred by RCU and potentially run in another task). Fixes: 7743c48e54ee ("keys: Cache result of request_key*() temporarily in task_struct") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index c0a4c12d38b2..56395be1c2a8 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk) put_cred(cred); #ifdef CONFIG_KEYS_REQUEST_CACHE - key_put(current->cached_requested_key); - current->cached_requested_key = NULL; + key_put(tsk->cached_requested_key); + tsk->cached_requested_key = NULL; #endif } -- cgit v1.2.3 From 3b4130418f62b0e7a4685cc2c03bb41c6cb8922d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 13 Jan 2020 23:26:47 -0800 Subject: bpf: Fix seq_show for BPF_MAP_TYPE_STRUCT_OPS Instead of using bpf_struct_ops_map_lookup_elem() which is not implemented, bpf_struct_ops_map_seq_show_elem() should also use bpf_struct_ops_map_sys_lookup_elem() which does an inplace update to the value. The change allocates a value to pass to bpf_struct_ops_map_sys_lookup_elem(). [root@arch-fb-vm1 bpf]# cat /sys/fs/bpf/dctcp {{{1}},BPF_STRUCT_OPS_STATE_INUSE,{{00000000df93eebc,00000000df93eebc},0,2, ... Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Reported-by: Dan Carpenter Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200114072647.3188298-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ddf48f49914b..8ad1c9ea26b2 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -496,14 +496,20 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; + int err; - value = bpf_struct_ops_map_lookup_elem(map, key); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) return; - btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, - value, m); - seq_puts(m, "\n"); + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + if (!err) { + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); + } + + kfree(value); } static void bpf_struct_ops_map_free(struct bpf_map *map) -- cgit v1.2.3 From 99c9a923e97a583a38050baa92c9377d73946330 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 10 Jan 2020 10:45:39 +0900 Subject: tracing/uprobe: Fix double perf_event linking on multiprobe uprobe Fix double perf_event linking to trace_uprobe_filter on multiple uprobe event by moving trace_uprobe_filter under trace_probe_event. In uprobe perf event, trace_uprobe_filter data structure is managing target mm filters (in perf_event) related to each uprobe event. Since commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") left the trace_uprobe_filter data structure in trace_uprobe, if a trace_probe_event has multiple trace_uprobe (multi-probe event), a perf_event is added to different trace_uprobe_filter on each trace_uprobe. This leads a linked list corruption. To fix this issue, move trace_uprobe_filter to trace_probe_event and link it once on each event instead of each probe. Link: http://lkml.kernel.org/r/157862073931.1800.3800576241181489174.stgit@devnote2 Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: "David S . Miller" Cc: Namhyung Kim Cc: =?utf-8?q?Toke_H=C3=B8iland-J?= =?utf-8?b?w7hyZ2Vuc2Vu?= Cc: Jean-Tsung Hsiao Cc: Jesper Dangaard Brouer Cc: stable@vger.kernel.org Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Link: https://lkml.kernel.org/r/20200108171611.GA8472@kernel.org Reported-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 5 +- kernel/trace/trace_probe.h | 3 +- kernel/trace/trace_uprobe.c | 124 ++++++++++++++++++++++++++++---------------- 4 files changed, 86 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f890262c8a3..3e5f9c7d939c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group); + ret = trace_probe_init(&tk->tp, event, group, 0); if (ret < 0) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 905b10af5d5c..bba18cf44a30 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -984,7 +984,7 @@ void trace_probe_cleanup(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group) + const char *group, size_t event_data_size) { struct trace_event_call *call; int ret = 0; @@ -992,7 +992,8 @@ int trace_probe_init(struct trace_probe *tp, const char *event, if (!event || !group) return -EINVAL; - tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + tp->event = kzalloc(sizeof(struct trace_probe_event) + event_data_size, + GFP_KERNEL); if (!tp->event) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4ee703728aec..03e4e180058d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -230,6 +230,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; + char data[0]; }; struct trace_probe { @@ -322,7 +323,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group); + const char *group, size_t event_data_size); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..f66e202fec13 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -60,7 +60,6 @@ static struct dyn_event_operations trace_uprobe_ops = { */ struct trace_uprobe { struct dyn_event devent; - struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct path path; struct inode *inode; @@ -264,6 +263,14 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } NOKPROBE_SYMBOL(process_fetch_insn) +static struct trace_uprobe_filter * +trace_uprobe_get_filter(struct trace_uprobe *tu) +{ + struct trace_probe_event *event = tu->tp.event; + + return (struct trace_uprobe_filter *)&event->data[0]; +} + static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); @@ -351,7 +358,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group); + ret = trace_probe_init(&tu->tp, event, group, + sizeof(struct trace_uprobe_filter)); if (ret < 0) goto error; @@ -359,7 +367,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; - init_trace_uprobe_filter(&tu->filter); + init_trace_uprobe_filter(trace_uprobe_get_filter(tu)); return tu; error: @@ -1067,13 +1075,14 @@ static void __probe_event_disable(struct trace_probe *tp) struct trace_probe *pos; struct trace_uprobe *tu; + tu = container_of(tp, struct trace_uprobe, tp); + WARN_ON(!uprobe_filter_is_empty(trace_uprobe_get_filter(tu))); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); if (!tu->inode) continue; - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->inode = NULL; } @@ -1108,7 +1117,7 @@ static int probe_event_enable(struct trace_event_call *call, } tu = container_of(tp, struct trace_uprobe, tp); - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + WARN_ON(!uprobe_filter_is_empty(trace_uprobe_get_filter(tu))); if (enabled) return 0; @@ -1205,39 +1214,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) } static inline bool -uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +trace_uprobe_filter_event(struct trace_uprobe_filter *filter, + struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); + return __uprobe_perf_filter(filter, event->hw.target->mm); } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { list_del(&event->hw.tp_list); - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || (event->hw.target->flags & PF_EXITING) || - uprobe_filter_event(tu, event); + trace_uprobe_filter_event(filter, event); } else { - tu->filter.nr_systemwide--; - done = tu->filter.nr_systemwide; + filter->nr_systemwide--; + done = filter->nr_systemwide; } - write_unlock(&tu->filter.rwlock); - - if (!done) - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + write_unlock(&filter->rwlock); - return 0; + return done; } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +/* This returns true if the filter always covers target mm */ +static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - int err; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid @@ -1247,28 +1256,21 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) * attr.enable_on_exec means that exec/mmap will install the * breakpoints we need. */ - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || event->parent || event->attr.enable_on_exec || - uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); + trace_uprobe_filter_event(filter, event); + list_add(&event->hw.tp_list, &filter->perf_events); } else { - done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; + done = filter->nr_systemwide; + filter->nr_systemwide++; } - write_unlock(&tu->filter.rwlock); + write_unlock(&filter->rwlock); - err = 0; - if (!done) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - if (err) - uprobe_perf_close(tu, event); - } - return err; + return done; } -static int uprobe_perf_multi_call(struct trace_event_call *call, - struct perf_event *event, - int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +static int uprobe_perf_close(struct trace_event_call *call, + struct perf_event *event) { struct trace_probe *pos, *tp; struct trace_uprobe *tu; @@ -1278,25 +1280,59 @@ static int uprobe_perf_multi_call(struct trace_event_call *call, if (WARN_ON_ONCE(!tp)) return -ENODEV; + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_remove(trace_uprobe_get_filter(tu), event)) + return 0; + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); - ret = op(tu, event); + ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; } return ret; } + +static int uprobe_perf_open(struct trace_event_call *call, + struct perf_event *event) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int err = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_add(trace_uprobe_get_filter(tu), event)) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) { + uprobe_perf_close(call, event); + break; + } + } + + return err; +} + static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { + struct trace_uprobe_filter *filter; struct trace_uprobe *tu; int ret; tu = container_of(uc, struct trace_uprobe, consumer); - read_lock(&tu->filter.rwlock); - ret = __uprobe_perf_filter(&tu->filter, mm); - read_unlock(&tu->filter.rwlock); + filter = trace_uprobe_get_filter(tu); + + read_lock(&filter->rwlock); + ret = __uprobe_perf_filter(filter, mm); + read_unlock(&filter->rwlock); return ret; } @@ -1419,10 +1455,10 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_multi_call(event, data, uprobe_perf_open); + return uprobe_perf_open(event, data); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_multi_call(event, data, uprobe_perf_close); + return uprobe_perf_close(event, data); #endif default: -- cgit v1.2.3 From 59e7cffe5cca6f15c21d394492fa4739172de1c5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 5 Jun 2014 20:22:05 +0200 Subject: ring-bufer: kernel-doc warning fixes Also fixes a couple of typos Link: http://lkml.kernel.org/r/1401992525-10417-1-git-send-email-fabf@skynet.be Cc: Andrew Morton Signed-off-by: Fabian Frederick [ Found this deep in the abyss of my INBOX ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f846de2aa435..46d67ff68795 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1368,6 +1368,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. + * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data @@ -4331,6 +4332,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); /** * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. + * @cpu: The CPU to get ring buffer size from. */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { @@ -4504,6 +4506,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with * @buffer_b: The other buffer to swap with + * @cpu: the CPU of the buffers to swap * * This function is useful for tracers that want to take a "snapshot" * of a CPU buffer and has another back up buffer lying around. -- cgit v1.2.3 From cfc585a401764f0d352602d614c19866bb84738a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 14 Jan 2020 16:27:51 -0500 Subject: ring-buffer: Fix kernel doc for rb_update_event() rb_update_event has changed without the kernel-doc update. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 46d67ff68795..3bab9b0a90b6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2331,11 +2331,11 @@ static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, /** * rb_update_event - update event type and data + * @cpu_buffer: The per cpu buffer of the @event * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer + * @info: The info to update the @event with (contains length and delta) * - * Update the type and data fields of the event. The length + * Update the type and data fields of the @event. The length * is the actual size that is written to the ring buffer, * and with this, we can determine what to place into the * data field. -- cgit v1.2.3 From aeed8aa3874dc15b9d82a6fe796fd7cfbb684448 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Dec 2019 11:31:43 +0900 Subject: tracing: trigger: Replace unneeded RCU-list traversals With CONFIG_PROVE_RCU_LIST, I had many suspicious RCU warnings when I ran ftracetest trigger testcases. ----- # dmesg -c > /dev/null # ./ftracetest test.d/trigger ... # dmesg | grep "RCU-list traversed" | cut -f 2 -d ] | cut -f 2 -d " " kernel/trace/trace_events_hist.c:6070 kernel/trace/trace_events_hist.c:1760 kernel/trace/trace_events_hist.c:5911 kernel/trace/trace_events_trigger.c:504 kernel/trace/trace_events_hist.c:1810 kernel/trace/trace_events_hist.c:3158 kernel/trace/trace_events_hist.c:3105 kernel/trace/trace_events_hist.c:5518 kernel/trace/trace_events_hist.c:5998 kernel/trace/trace_events_hist.c:6019 kernel/trace/trace_events_hist.c:6044 kernel/trace/trace_events_trigger.c:1500 kernel/trace/trace_events_trigger.c:1540 kernel/trace/trace_events_trigger.c:539 kernel/trace/trace_events_trigger.c:584 ----- I investigated those warnings and found that the RCU-list traversals in event trigger and hist didn't need to use RCU version because those were called only under event_mutex. I also checked other RCU-list traversals related to event trigger list, and found that most of them were called from event_hist_trigger_func() or hist_unregister_trigger() or register/unregister functions except for a few cases. Replace these unneeded RCU-list traversals with normal list traversal macro and lockdep_assert_held() to check the event_mutex is held. Link: http://lkml.kernel.org/r/157680910305.11685.15110237954275915782.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 30350d65ac567 ("tracing: Add variable support to hist triggers") Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++---------- kernel/trace/trace_events_trigger.c | 20 +++++++++++++----- 2 files changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f62de5f43e79..d33b046f985a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1766,11 +1766,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, struct event_trigger_data *test; struct hist_field *hist_field; + lockdep_assert_held(&event_mutex); + hist_field = find_var_field(hist_data, var_name); if (hist_field) return hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -1820,7 +1822,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file, struct event_trigger_data *test; struct hist_field *hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -3115,7 +3119,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, { struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (test->private_data == hist_data) return test->filter_str; @@ -3166,9 +3172,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data, struct event_trigger_data *test; unsigned int n_keys; + lockdep_assert_held(&event_mutex); + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; @@ -5528,7 +5536,7 @@ static int hist_show(struct seq_file *m, void *v) goto out_unlock; } - list_for_each_entry_rcu(data, &event_file->triggers, list) { + list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } @@ -5921,7 +5929,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->attrs->name && !named_data) goto new; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6005,10 +6015,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data, struct event_trigger_data *test, *named_data = NULL; bool match = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (hist_trigger_match(data, test, named_data, false)) { match = true; @@ -6026,10 +6038,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6051,10 +6065,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6080,7 +6096,9 @@ static bool hist_file_check_refs(struct trace_event_file *file) struct hist_trigger_data *hist_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; if (check_var_refs(hist_data)) @@ -6323,7 +6341,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + list_for_each_entry_rcu(test, &enable_data->file->triggers, list, + lockdep_is_held(&event_mutex)) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (enable_data->enable) test->paused = false; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2cd53ca21b51..40106fff06a4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file) struct event_trigger_data *data; bool set_cond = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->filter || event_command_post_trigger(data->cmd_ops) || event_command_needs_rec(data->cmd_ops)) { set_cond = true; @@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { ret = -EEXIST; goto out; @@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); @@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && (test->cmd_ops->trigger_type == @@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && (data->cmd_ops->trigger_type == -- cgit v1.2.3 From de95a991bb72e009f47e0c4bbc90fc5f594588d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2019 20:56:19 -0800 Subject: tick/sched: Annotate lockless access to last_jiffies_update syzbot (KCSAN) reported a data-race in tick_do_update_jiffies64(): BUG: KCSAN: data-race in tick_do_update_jiffies64 / tick_do_update_jiffies64 write to 0xffffffff8603d008 of 8 bytes by interrupt on cpu 1: tick_do_update_jiffies64+0x100/0x250 kernel/time/tick-sched.c:73 tick_sched_do_timer+0xd4/0xe0 kernel/time/tick-sched.c:138 tick_sched_timer+0x43/0xe0 kernel/time/tick-sched.c:1292 __run_hrtimer kernel/time/hrtimer.c:1514 [inline] __hrtimer_run_queues+0x274/0x5f0 kernel/time/hrtimer.c:1576 hrtimer_interrupt+0x22a/0x480 kernel/time/hrtimer.c:1638 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1110 [inline] smp_apic_timer_interrupt+0xdc/0x280 arch/x86/kernel/apic/apic.c:1135 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 arch_local_irq_restore arch/x86/include/asm/paravirt.h:756 [inline] kcsan_setup_watchpoint+0x1d4/0x460 kernel/kcsan/core.c:436 check_access kernel/kcsan/core.c:466 [inline] __tsan_read1 kernel/kcsan/core.c:593 [inline] __tsan_read1+0xc2/0x100 kernel/kcsan/core.c:593 kallsyms_expand_symbol.constprop.0+0x70/0x160 kernel/kallsyms.c:79 kallsyms_lookup_name+0x7f/0x120 kernel/kallsyms.c:170 insert_report_filterlist kernel/kcsan/debugfs.c:155 [inline] debugfs_write+0x14b/0x2d0 kernel/kcsan/debugfs.c:256 full_proxy_write+0xbd/0x100 fs/debugfs/file.c:225 __vfs_write+0x67/0xc0 fs/read_write.c:494 vfs_write fs/read_write.c:558 [inline] vfs_write+0x18a/0x390 fs/read_write.c:542 ksys_write+0xd5/0x1b0 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __x64_sys_write+0x4c/0x60 fs/read_write.c:620 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff8603d008 of 8 bytes by task 0 on cpu 0: tick_do_update_jiffies64+0x2b/0x250 kernel/time/tick-sched.c:62 tick_nohz_update_jiffies kernel/time/tick-sched.c:505 [inline] tick_nohz_irq_enter kernel/time/tick-sched.c:1257 [inline] tick_irq_enter+0x139/0x1c0 kernel/time/tick-sched.c:1274 irq_enter+0x4f/0x60 kernel/softirq.c:354 entering_irq arch/x86/include/asm/apic.h:517 [inline] entering_ack_irq arch/x86/include/asm/apic.h:523 [inline] smp_apic_timer_interrupt+0x55/0x280 arch/x86/kernel/apic/apic.c:1133 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:60 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Use READ_ONCE() and WRITE_ONCE() to annotate this expected race. Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191205045619.204946-1-edumazet@google.com --- kernel/time/tick-sched.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8b192e67aabc..a792d21cac64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. */ - delta = ktime_sub(now, last_jiffies_update); + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; @@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); } do_timer(++ticks); -- cgit v1.2.3 From 6b6d188aae79a630957aefd88ff5c42af6553ee3 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 9 Jan 2020 07:59:07 -0800 Subject: alarmtimer: Unregister wakeup source when module get fails The alarmtimer_rtc_add_device() function creates a wakeup source and then tries to grab a module reference. If that fails the function returns early with an error code, but fails to remove the wakeup source. Cleanup this exit path so there is no dangling wakeup source, which is named 'alarmtime' left allocated which will conflict with another RTC device that may be registered later. Fixes: 51218298a25e ("alarmtimer: Ensure RTC module is not unloaded") Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200109155910.907-2-swboyd@chromium.org --- kernel/time/alarmtimer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b51b36e533c4..9dc7a0913190 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -91,6 +91,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct wakeup_source *__ws; + int ret = 0; if (rtcdev) return -EBUSY; @@ -105,8 +106,8 @@ static int alarmtimer_rtc_add_device(struct device *dev, spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { if (!try_module_get(rtc->owner)) { - spin_unlock_irqrestore(&rtcdev_lock, flags); - return -1; + ret = -1; + goto unlock; } rtcdev = rtc; @@ -115,11 +116,12 @@ static int alarmtimer_rtc_add_device(struct device *dev, ws = __ws; __ws = NULL; } +unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); wakeup_source_unregister(__ws); - return 0; + return ret; } static inline void alarmtimer_rtc_timer_init(void) -- cgit v1.2.3 From e9f35f634e099894f4d6c3b039cd3de5281ee637 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 15 Jan 2020 15:49:31 +0100 Subject: modsign: print module name along with error message It is useful to know which module failed signature verification, so print the module name along with the error message. Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d4d876172e9d..2b5f9c4748bc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2882,7 +2882,7 @@ static int module_sig_check(struct load_info *info, int flags) reason = "Loading of module with unavailable key"; decide: if (is_module_sig_enforced()) { - pr_notice("%s is rejected\n", reason); + pr_notice("%s: %s is rejected\n", info->name, reason); return -EKEYREJECTED; } -- cgit v1.2.3 From 75ea91cd3eab8214eb8cd7f61ac23f41884d29ec Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Fri, 10 Jan 2020 15:23:20 +0800 Subject: cgroup: fix function name in comment Function name cgroup_rstat_cpu_pop_upated() in comment should be cgroup_rstat_cpu_pop_updated(). Signed-off-by: Chen Zhou Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b48b22d4deb6..6f87352f8219 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,7 +33,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we * see NULL updated_next or they see our updated stat. */ smp_mb(); -- cgit v1.2.3 From 1c5da0ec7f20dfb56030fb93f7f52f48e12deb52 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Mon, 13 Jan 2020 17:52:39 -0500 Subject: workqueue: add worker function to workqueue_execute_end tracepoint It's surprising that workqueue_execute_end includes only the work when its counterpart workqueue_execute_start has both the work and the worker function. You can't set a tracing filter or trigger based on the function, and postprocessing scripts interested in specific functions are harder to write since they have to remember the work from _start and match it up with the same field in _end. Add the function name, taking care to use the copy stashed in the worker since the work is no longer safe to touch. Signed-off-by: Daniel Jordan Cc: Tejun Heo Cc: Lai Jiangshan Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cfc923558e04..4bdfa270a37b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2266,7 +2266,7 @@ __acquires(&pool->lock) * While we must be careful to not use "work" after this, the trace * point will only record its address. */ - trace_workqueue_execute_end(work); + trace_workqueue_execute_end(work, worker->current_func); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3 From 3bc0bb36fa30e95ca829e9cf480e1ef7f7638333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Thu, 9 Jan 2020 16:05:59 +0100 Subject: cgroup: Prevent double killing of css when enabling threaded cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_cgcore_no_internal_process_constraint_on_threads selftest when running with subsystem controlling noise triggers two warnings: > [ 597.443115] WARNING: CPU: 1 PID: 28167 at kernel/cgroup/cgroup.c:3131 cgroup_apply_control_enable+0xe0/0x3f0 > [ 597.443413] WARNING: CPU: 1 PID: 28167 at kernel/cgroup/cgroup.c:3177 cgroup_apply_control_disable+0xa6/0x160 Both stem from a call to cgroup_type_write. The first warning was also triggered by syzkaller. When we're switching cgroup to threaded mode shortly after a subsystem was disabled on it, we can see the respective subsystem css dying there. The warning in cgroup_apply_control_enable is harmless in this case since we're not adding new subsys anyway. The warning in cgroup_apply_control_disable indicates an attempt to kill css of recently disabled subsystem repeatedly. The commit prevents these situations by making cgroup_type_write wait for all dying csses to go away before re-applying subtree controls. When at it, the locations of WARN_ON_ONCE calls are moved so that warning is triggered only when we are about to misuse the dying css. Reported-by: syzbot+5493b2a54d31d6aea629@syzkaller.appspotmail.com Reported-by: Christian Brauner Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..1e12e6928bca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3055,8 +3055,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -3066,6 +3064,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } + WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); + if (css_visible(css)) { ret = css_populate_dir(css); if (ret) @@ -3101,11 +3101,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - if (!css) continue; + WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); + if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); @@ -3392,7 +3392,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, if (strcmp(strstrip(buf), "threaded")) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn, false); + /* drain dying csses before we re-apply (threaded) subtree control */ + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; -- cgit v1.2.3 From 8482941f09067da42f9c3362e15bfb3f3c19d610 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Jan 2020 19:50:02 -0800 Subject: bpf: Add bpf_send_signal_thread() helper Commit 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") added helper bpf_send_signal() which permits bpf program to send a signal to the current process. The signal may be delivered to any threads in the process. We found a use case where sending the signal to the current thread is more preferable. - A bpf program will collect the stack trace and then send signal to the user application. - The user application will add some thread specific information to the just collected stack trace for later analysis. If bpf_send_signal() is used, user application will need to check whether the thread receiving the signal matches the thread collecting the stack by checking thread id. If not, it will need to send signal to another thread through pthread_kill(). This patch proposed a new helper bpf_send_signal_thread(), which sends the signal to the thread corresponding to the current kernel task. This way, user space is guaranteed that bpf_program execution context and user space signal handling context are the same thread. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115035002.602336-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e5ef4ae9edb5..19e793aa441a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -703,6 +703,7 @@ struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; + enum pid_type type; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry) struct send_signal_irq_work *work; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); } -BPF_CALL_1(bpf_send_signal, u32, sig) +static int bpf_send_signal_common(u32 sig, enum pid_type type) { struct send_signal_irq_work *work = NULL; @@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig) */ work->task = current; work->sig = sig; + work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); + return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_TGID); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_send_signal_thread, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_PID); +} + +static const struct bpf_func_proto bpf_send_signal_thread_proto = { + .func = bpf_send_signal_thread, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; default: return NULL; } -- cgit v1.2.3 From 5167c506d62dd9ffab73eba23c79b0a8845c9fe1 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 10 Jan 2020 16:39:02 +0800 Subject: tick/common: Touch watchdog in tick_unfreeze() on all CPUs Suspend to IDLE invokes tick_unfreeze() on resume. tick_unfreeze() on the first resuming CPU resumes timekeeping, which also has the side effect of resetting the softlockup watchdog on this CPU. But on the secondary CPUs the watchdog is not reset in the resume / unfreeze() path, which can result in false softlockup warnings on those CPUs depending on the time spent in suspend. Prevent this by clearing the softlock watchdog in the unfreeze path also on the secondary resuming CPUs. [ tglx: Massaged changelog ] Signed-off-by: Chunyan Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200110083902.27276-1-chunyan.zhang@unisoc.com --- kernel/time/tick-common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 59225b484e4e..7e5d3524e924 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -558,6 +559,7 @@ void tick_unfreeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { + touch_softlockup_watchdog(); tick_resume_local(); } -- cgit v1.2.3 From 0af2ffc93a4b50948f9dad2786b7f1bd253bf0b9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jan 2020 21:47:33 +0100 Subject: bpf: Fix incorrect verifier simulation of ARSH under ALU32 Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (85) call bpf_get_socket_cookie#46 1: R0_w=invP(id=0) R10=fp0 1: (57) r0 &= 808464432 2: R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 2: (14) w0 -= 810299440 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 3: (c4) w0 s>>= 1 4: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 4: (76) if w0 s>= 0x30303030 goto pc+216 221: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 221: (95) exit processed 6 insns (limit 1000000) [...] Taking a closer look, the program was xlated as follows: # ./bpftool p d x i 12 0: (85) call bpf_get_socket_cookie#7800896 1: (bf) r6 = r0 2: (57) r6 &= 808464432 3: (14) w6 -= 810299440 4: (c4) w6 s>>= 1 5: (76) if w6 s>= 0x30303030 goto pc+216 6: (05) goto pc-1 7: (05) goto pc-1 8: (05) goto pc-1 [...] 220: (05) goto pc-1 221: (05) goto pc-1 222: (95) exit Meaning, the visible effect is very similar to f54c7898ed1c ("bpf: Fix precision tracking for unbounded scalars"), that is, the fall-through branch in the instruction 5 is considered to be never taken given the conclusion from the min/max bounds tracking in w6, and therefore the dead-code sanitation rewrites it as goto pc-1. However, real-life input disagrees with verification analysis since a soft-lockup was observed. The bug sits in the analysis of the ARSH. The definition is that we shift the target register value right by K bits through shifting in copies of its sign bit. In adjust_scalar_min_max_vals(), we do first coerce the register into 32 bit mode, same happens after simulating the operation. However, for the case of simulating the actual ARSH, we don't take the mode into account and act as if it's always 64 bit, but location of sign bit is different: dst_reg->smin_value >>= umin_val; dst_reg->smax_value >>= umin_val; dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); Consider an unknown R0 where bpf_get_socket_cookie() (or others) would for example return 0xffff. With the above ARSH simulation, we'd see the following results: [...] 1: R1=ctx(id=0,off=0,imm=0) R2_w=invP65535 R10=fp0 1: (85) call bpf_get_socket_cookie#46 2: R0_w=invP(id=0) R10=fp0 2: (57) r0 &= 808464432 -> R0_runtime = 0x3030 3: R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 3: (14) w0 -= 810299440 -> R0_runtime = 0xcfb40000 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 (0xffffffff) 4: (c4) w0 s>>= 1 -> R0_runtime = 0xe7da0000 5: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 (0x67c00000) (0x7ffbfff8) [...] In insn 3, we have a runtime value of 0xcfb40000, which is '1100 1111 1011 0100 0000 0000 0000 0000', the result after the shift has 0xe7da0000 that is '1110 0111 1101 1010 0000 0000 0000 0000', where the sign bit is correctly retained in 32 bit mode. In insn4, the umax was 0xffffffff, and changed into 0x7ffbfff8 after the shift, that is, '0111 1111 1111 1011 1111 1111 1111 1000' and means here that the simulation didn't retain the sign bit. With above logic, the updates happen on the 64 bit min/max bounds and given we coerced the register, the sign bits of the bounds are cleared as well, meaning, we need to force the simulation into s32 space for 32 bit alu mode. Verification after the fix below. We're first analyzing the fall-through branch on 32 bit signed >= test eventually leading to rejection of the program in this specific case: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r2 = 808464432 1: R1=ctx(id=0,off=0,imm=0) R2_w=invP808464432 R10=fp0 1: (85) call bpf_get_socket_cookie#46 2: R0_w=invP(id=0) R10=fp0 2: (bf) r6 = r0 3: R0_w=invP(id=0) R6_w=invP(id=0) R10=fp0 3: (57) r6 &= 808464432 4: R0_w=invP(id=0) R6_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 4: (14) w6 -= 810299440 5: R0_w=invP(id=0) R6_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 5: (c4) w6 s>>= 1 6: R0_w=invP(id=0) R6_w=invP(id=0,umin_value=3888119808,umax_value=4294705144,var_off=(0xe7c00000; 0x183bfff8)) R10=fp0 (0x67c00000) (0xfffbfff8) 6: (76) if w6 s>= 0x30303030 goto pc+216 7: R0_w=invP(id=0) R6_w=invP(id=0,umin_value=3888119808,umax_value=4294705144,var_off=(0xe7c00000; 0x183bfff8)) R10=fp0 7: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 8 insns (limit 1000000) [...] Fixes: 9cbe1f5a32dc ("bpf/verifier: improve register value range tracking with ARSH") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115204733.16648-1-daniel@iogearbox.net --- kernel/bpf/tnum.c | 9 +++++++-- kernel/bpf/verifier.c | 13 ++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index ca52b9642943..d4f335a9a899 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } -struct tnum tnum_arshift(struct tnum a, u8 min_shift) +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ - return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); + if (insn_bitness == 32) + return TNUM((u32)(((s32)a.value) >> min_shift), + (u32)(((s32)a.mask) >> min_shift)); + else + return TNUM((s64)a.value >> min_shift, + (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ce85e7041f0c..7d530ce8719d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5049,9 +5049,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + if (insn_bitness == 32) { + dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); + dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); + } else { + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + } + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, + insn_bitness); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. -- cgit v1.2.3 From 15c14a3dca421f086c187155afba3222b879472d Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:00 -0800 Subject: bpf: Add bpf_map_{value_size, update_value, map_copy_value} functions This commit moves reusable code from map_lookup_elem and map_update_elem to avoid code duplication in kernel/bpf/syscall.c. Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200115184308.162644-2-brianvv@google.com --- kernel/bpf/syscall.c | 280 ++++++++++++++++++++++++++++----------------------- 1 file changed, 152 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f9db72a96ec0..08b0b6e40454 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -129,6 +129,154 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static u32 bpf_map_value_size(struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + return sizeof(u32); + else + return map->value_size; +} + +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + +static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, + void *value, __u64 flags) +{ + int err; + + /* Need to create a kthread, thus must support schedule */ + if (bpf_map_is_dev_bound(map)) { + return bpf_map_offload_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + return map->ops->map_update_elem(map, key, value, flags); + } else if (IS_FD_PROG_ARRAY(map)) { + return bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + } + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + flags); + } else if (IS_FD_ARRAY(map)) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + +static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, + __u64 flags) +{ + void *ptr; + int err; + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + return err; + } + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + } else { + rcu_read_lock(); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; + if (flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); + } + rcu_read_unlock(); + } + + this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -827,7 +975,7 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; @@ -859,75 +1007,14 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else if (IS_FD_MAP(map)) - value_size = sizeof(u32); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - goto done; - } - - preempt_disable(); - this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_lookup_elem(map, key, value); - } else if (IS_FD_HASH(map)) { - err = bpf_fd_htab_map_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - err = bpf_fd_reuseport_array_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_peek_elem(map, value); - } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - /* struct_ops map requires directly updating "value" */ - err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); - } else { - rcu_read_lock(); - if (map->ops->map_lookup_elem_sys_only) - ptr = map->ops->map_lookup_elem_sys_only(map, key); - else - ptr = map->ops->map_lookup_elem(map, key); - if (IS_ERR(ptr)) { - err = PTR_ERR(ptr); - } else if (!ptr) { - err = -ENOENT; - } else { - err = 0; - if (attr->flags & BPF_F_LOCK) - /* lock 'ptr' and copy everything but lock */ - copy_map_value_locked(map, value, ptr, true); - else - copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); - } - rcu_read_unlock(); - } - this_cpu_dec(bpf_prog_active); - preempt_enable(); - -done: + err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; @@ -946,16 +1033,6 @@ err_put: return err; } -static void maybe_wait_bpf_programs(struct bpf_map *map) -{ - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. - */ - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); -} #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags @@ -1011,61 +1088,8 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_update_elem(map, key, value, attr->flags); - goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - err = map->ops->map_update_elem(map, key, value, attr->flags); - goto out; - } else if (IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - goto out; - } + err = bpf_map_update_value(map, f, key, value, attr->flags); - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_update(map, key, value, - attr->flags); - } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - /* rcu_read_lock() is not needed */ - err = bpf_fd_reuseport_array_update_elem(map, key, value, - attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_push_elem(map, value, attr->flags); - } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); - } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); - maybe_wait_bpf_programs(map); -out: free_value: kfree(value); free_key: -- cgit v1.2.3 From cb4d03ab499d4c040f4ab6fd4389d2b49f42b5a5 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:01 -0800 Subject: bpf: Add generic support for lookup batch op This commit introduces generic support for the bpf_map_lookup_batch. This implementation can be used by almost all the bpf maps since its core implementation is relying on the existing map_get_next_key and map_lookup_elem. The bpf syscall subcommand introduced is: BPF_MAP_LOOKUP_BATCH The UAPI attribute is: struct { /* struct used by BPF_MAP_*_BATCH commands */ __aligned_u64 in_batch; /* start batch, * NULL to start from beginning */ __aligned_u64 out_batch; /* output: next start batch */ __aligned_u64 keys; __aligned_u64 values; __u32 count; /* input/output: * input: # of key/value * elements * output: # of filled elements */ __u32 map_fd; __u64 elem_flags; __u64 flags; } batch; in_batch/out_batch are opaque values use to communicate between user/kernel space, in_batch/out_batch must be of key_size length. To start iterating from the beginning in_batch must be null, count is the # of key/value elements to retrieve. Note that the 'keys' buffer must be a buffer of key_size * count size and the 'values' buffer must be value_size * count, where value_size must be aligned to 8 bytes by userspace if it's dealing with percpu maps. 'count' will contain the number of keys/values successfully retrieved. Note that 'count' is an input/output variable and it can contain a lower value after a call. If there's no more entries to retrieve, ENOENT will be returned. If error is ENOENT, count might be > 0 in case it copied some values but there were no more entries to retrieve. Note that if the return code is an error and not -EFAULT, count indicates the number of elements successfully processed. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-3-brianvv@google.com --- kernel/bpf/syscall.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 156 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 08b0b6e40454..d604ddbb1afb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -219,10 +219,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - return err; - } + if (bpf_map_is_dev_bound(map)) + return bpf_map_offload_lookup_elem(map, key, value); preempt_disable(); this_cpu_inc(bpf_prog_active); @@ -1220,6 +1218,109 @@ err_put: return err; } +#define MAP_LOOKUP_RETRIES 3 + +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + void *buf, *buf_prevkey, *prev_key, *key, *value; + int err, retry = MAP_LOOKUP_RETRIES; + u32 value_size, cp, max_count; + bool first_key = false; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) + return -EINVAL; + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!buf_prevkey) + return -ENOMEM; + + buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + if (!buf) { + kvfree(buf_prevkey); + return -ENOMEM; + } + + err = -EFAULT; + first_key = false; + prev_key = NULL; + if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) + goto free_buf; + key = buf; + value = key + map->key_size; + if (ubatch) + prev_key = buf_prevkey; + + for (cp = 0; cp < max_count;) { + rcu_read_lock(); + err = map->ops->map_get_next_key(map, prev_key, key); + rcu_read_unlock(); + if (err) + break; + err = bpf_map_copy_value(map, key, value, + attr->batch.elem_flags); + + if (err == -ENOENT) { + if (retry) { + retry--; + continue; + } + err = -EINTR; + break; + } + + if (err) + goto free_buf; + + if (copy_to_user(keys + cp * map->key_size, key, + map->key_size)) { + err = -EFAULT; + goto free_buf; + } + if (copy_to_user(values + cp * value_size, value, value_size)) { + err = -EFAULT; + goto free_buf; + } + + if (!prev_key) + prev_key = buf_prevkey; + + swap(prev_key, key); + retry = MAP_LOOKUP_RETRIES; + cp++; + } + + if (err == -EFAULT) + goto free_buf; + + if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || + (cp && copy_to_user(uobatch, prev_key, map->key_size)))) + err = -EFAULT; + +free_buf: + kfree(buf_prevkey); + kfree(buf); + return err; +} + #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value static int map_lookup_and_delete_elem(union bpf_attr *attr) @@ -3076,6 +3177,54 @@ out: return err; } +#define BPF_MAP_BATCH_LAST_FIELD batch.flags + +#define BPF_DO_BATCH(fn) \ + do { \ + if (!fn) { \ + err = -ENOTSUPP; \ + goto err_put; \ + } \ + err = fn(map, attr, uattr); \ + } while (0) + +static int bpf_map_do_batch(const union bpf_attr *attr, + union bpf_attr __user *uattr, + int cmd) +{ + struct bpf_map *map; + int err, ufd; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_BATCH)) + return -EINVAL; + + ufd = attr->batch.map_fd; + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (cmd == BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + + if (cmd != BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + if (cmd == BPF_MAP_LOOKUP_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_batch); + +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3173,6 +3322,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; + case BPF_MAP_LOOKUP_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From aa2e93b8e58e18442edfb2427446732415bc215e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:02 -0800 Subject: bpf: Add generic support for update and delete batch ops This commit adds generic support for update and delete batch ops that can be used for almost all the bpf maps. These commands share the same UAPI attr that lookup and lookup_and_delete batch ops use and the syscall commands are: BPF_MAP_UPDATE_BATCH BPF_MAP_DELETE_BATCH The main difference between update/delete and lookup batch ops is that for update/delete keys/values must be specified for userspace and because of that, neither in_batch nor out_batch are used. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-4-brianvv@google.com --- kernel/bpf/syscall.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d604ddbb1afb..ce8244d1ba99 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1218,6 +1218,111 @@ err_put: return err; } +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 cp, max_count; + int err = 0; + void *key; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + max_count = attr->batch.count; + if (!max_count) + return 0; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + break; + } + + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + if (err) + break; + } + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + return err; +} + +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 value_size, cp, max_count; + int ufd = attr->map_fd; + void *key, *value; + struct fd f; + int err = 0; + + f = fdget(ufd); + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + return -ENOMEM; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + err = -EFAULT; + if (copy_from_user(value, values + cp * value_size, value_size)) + break; + + err = bpf_map_update_value(map, f, key, value, + attr->batch.elem_flags); + + if (err) + break; + } + + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + + kfree(value); + kfree(key); + return err; +} + #define MAP_LOOKUP_RETRIES 3 int generic_map_lookup_batch(struct bpf_map *map, @@ -3219,6 +3324,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_UPDATE_BATCH) + BPF_DO_BATCH(map->ops->map_update_batch); + else + BPF_DO_BATCH(map->ops->map_delete_batch); err_put: fdput(f); @@ -3325,6 +3434,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_UPDATE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + break; + case BPF_MAP_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From c60f2d2861778de6370a4f4ca6ab1d7d4a32efae Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:03 -0800 Subject: bpf: Add lookup and update batch ops to arraymap This adds the generic batch ops functionality to bpf arraymap, note that since deletion is not a valid operation for arraymap, only batch and lookup are added. Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200115184308.162644-5-brianvv@google.com --- kernel/bpf/arraymap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f0d19bbb9211..95d77770353c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = { .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, }; const struct bpf_map_ops percpu_array_map_ops = { -- cgit v1.2.3 From 057996380a42bb64ccc04383cfa9c0ace4ea11f0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 15 Jan 2020 10:43:04 -0800 Subject: bpf: Add batch ops to all htab bpf map htab can't use generic batch support due some problematic behaviours inherent to the data structre, i.e. while iterating the bpf map a concurrent program might delete the next entry that batch was about to use, in that case there's no easy solution to retrieve the next entry, the issue has been discussed multiple times (see [1] and [2]). The only way hmap can be traversed without the problem previously exposed is by making sure that the map is traversing entire buckets. This commit implements those strict requirements for hmap, the implementation follows the same interaction that generic support with some exceptions: - If keys/values buffer are not big enough to traverse a bucket, ENOSPC will be returned. - out_batch contains the value of the next bucket in the iteration, not the next key, but this is transparent for the user since the user should never use out_batch for other than bpf batch syscalls. This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete batch ops it is possible to use the generic implementations. [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ Signed-off-by: Yonghong Song Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-6-brianvv@google.com --- kernel/bpf/hashtab.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 9 +- 2 files changed, 272 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 22066a62c8c9..2d182c4ee9d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,6 +17,16 @@ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +#define BATCH_OPS(_name) \ + .map_lookup_batch = \ + _name##_map_lookup_batch, \ + .map_lookup_and_delete_batch = \ + _name##_map_lookup_and_delete_batch, \ + .map_update_batch = \ + generic_map_update_batch, \ + .map_delete_batch = \ + generic_map_delete_batch + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int +__htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete, bool is_lru_map, + bool is_percpu) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + u32 batch, max_count, size, bucket_size; + u64 elem_map_flags, map_flags; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + unsigned long flags; + struct htab_elem *l; + struct bucket *b; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + if ((elem_map_flags & ~BPF_F_LOCK) || + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return -EINVAL; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + batch = 0; + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) + return -EFAULT; + + if (batch >= htab->n_buckets) + return -ENOENT; + + key_size = htab->map.key_size; + roundup_key_size = round_up(htab->map.key_size, 8); + value_size = htab->map.value_size; + size = round_up(value_size, 8); + if (is_percpu) + value_size = size * num_possible_cpus(); + total = 0; + /* while experimenting with hash tables with sizes ranging from 10 to + * 1000, it was observed that a bucket can have upto 5 entries. + */ + bucket_size = 5; + +alloc: + /* We cannot do copy_from_user or copy_to_user inside + * the rcu_read_lock. Allocate enough space here. + */ + keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + if (!keys || !values) { + ret = -ENOMEM; + goto after_loop; + } + +again: + preempt_disable(); + this_cpu_inc(bpf_prog_active); + rcu_read_lock(); +again_nocopy: + dst_key = keys; + dst_val = values; + b = &htab->buckets[batch]; + head = &b->head; + raw_spin_lock_irqsave(&b->lock, flags); + + bucket_cnt = 0; + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + bucket_cnt++; + + if (bucket_cnt > (max_count - total)) { + if (total == 0) + ret = -ENOSPC; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + goto after_loop; + } + + if (bucket_cnt > bucket_size) { + bucket_size = bucket_cnt; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + kvfree(keys); + kvfree(values); + goto alloc; + } + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + memcpy(dst_key, l->key, key_size); + + if (is_percpu) { + int off = 0, cpu; + void __percpu *pptr; + + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(dst_val + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + } else { + value = l->key + roundup_key_size; + if (elem_map_flags & BPF_F_LOCK) + copy_map_value_locked(map, dst_val, value, + true); + else + copy_map_value(map, dst_val, value); + check_and_init_map_lock(map, dst_val); + } + if (do_delete) { + hlist_nulls_del_rcu(&l->hash_node); + if (is_lru_map) + bpf_lru_push_free(&htab->lru, &l->lru_node); + else + free_htab_elem(htab, l); + } + dst_key += key_size; + dst_val += value_size; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + /* If we are not copying data, we can go to next bucket and avoid + * unlocking the rcu. + */ + if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { + batch++; + goto again_nocopy; + } + + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, + key_size * bucket_cnt) || + copy_to_user(uvalues + total * value_size, values, + value_size * bucket_cnt))) { + ret = -EFAULT; + goto after_loop; + } + + total += bucket_cnt; + batch++; + if (batch >= htab->n_buckets) { + ret = -ENOENT; + goto after_loop; + } + goto again; + +after_loop: + if (ret == -EFAULT) + goto out; + + /* copy # of entries and next batch */ + ubatch = u64_to_user_ptr(attr->batch.out_batch); + if (copy_to_user(ubatch, &batch, sizeof(batch)) || + put_user(total, &uattr->batch.count)) + ret = -EFAULT; + +out: + kvfree(keys); + kvfree(values); + return ret; +} + +static int +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, true); +} + +static int +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, true); +} + +static int +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, false); +} + +static int +htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, false); +} + +static int +htab_lru_percpu_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, true); +} + +static int +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, true); +} + +static int +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, false); +} + +static int +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, false); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab), }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab_lru), }; /* Called from eBPF program */ @@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_percpu), }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_lru_percpu), }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce8244d1ba99..0d94d361379c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3310,7 +3310,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (IS_ERR(map)) return PTR_ERR(map); - if (cmd == BPF_MAP_LOOKUP_BATCH && + if ((cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; @@ -3324,6 +3325,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch); else @@ -3434,6 +3437,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_LOOKUP_AND_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, + BPF_MAP_LOOKUP_AND_DELETE_BATCH); + break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); break; -- cgit v1.2.3 From d129479f1fff5c88adbf8dff7649664916d28f81 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 18 Dec 2019 05:31:25 +0000 Subject: watchdog: Remove soft_lockup_hrtimer_cnt and related code After commit 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work"), the percpu soft_lockup_hrtimer_cnt is not used any more, so remove it and related code. Signed-off-by: Jisheng Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191218131720.4146aea2@xhacker.debian --- kernel/watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f41334ef0971..0621301ae8cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -173,7 +173,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -350,8 +349,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); complete(this_cpu_ptr(&softlockup_completion)); -- cgit v1.2.3 From 82d1b8158c9a77c2c9b04c4af22fd62f3686cd9d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Jan 2020 08:20:18 -0500 Subject: tracing: Allow trace_printk() to nest in other tracing code trace_printk() is used to debug the kernel which includes the tracing infrastructure. But because it writes to the ring buffer, and so does much of the tracing infrastructure, the ring buffer's recursive detection will drop writes to the ring buffer that is in the same context as the current write is happening (it allows interrupts to write when normal context is writing, but wont let normal context write while normal context is writing). This can cause confusion and think that the code is where the trace_printk() exists is not hit. To solve this, up the recursive nesting of the ring buffer when trace_printk() is called before it writes to the buffer itself. Note, this does make it dangerous to use trace_printk() in the ring buffer code itself, because this basically disables the recursion protection of trace_printk() buffer writes. But as trace_printk() is only used for debugging, and if this does occur, the developer will see the cause real quick (recursive blowing up of the stack). Thus the developer can deal with that. But having trace_printk() silently ignored is a much bigger problem, and disabling recursive protection is a small price to pay to fix it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 106bbc0988fe..2e1db19dce97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -866,10 +866,13 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.array_buffer.buffer; + ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, irq_flags, pc); - if (!event) - return 0; + if (!event) { + size = 0; + goto out; + } entry = ring_buffer_event_data(event); entry->ip = ip; @@ -885,7 +888,8 @@ int __trace_puts(unsigned long ip, const char *str, int size) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); - + out: + ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_puts); @@ -902,6 +906,7 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned long irq_flags; int size = sizeof(struct bputs_entry); + int ret = 0; int pc; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) @@ -914,10 +919,12 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.array_buffer.buffer; + + ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, irq_flags, pc); if (!event) - return 0; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -926,7 +933,10 @@ int __trace_bputs(unsigned long ip, const char *str) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); - return 1; + ret = 1; + out: + ring_buffer_nest_end(buffer); + return ret; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -3225,6 +3235,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -3240,6 +3251,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: @@ -3282,6 +3294,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, local_save_flags(flags); size = sizeof(*entry) + len + 1; + ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) @@ -3296,6 +3309,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, } out: + ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: -- cgit v1.2.3 From 3a51449b7959f68cc45abe67298e40c7eb57167b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 24 Oct 2019 13:49:26 +0200 Subject: watchdog/softlockup: Remove obsolete check of last reported task commit 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work") ensures that the watchdog is reliably touched during a task switch. As a result the check for an unnoticed task switch is not longer needed. Remove the relevant code, which effectively reverts commit b1a8de1f5343 ("softlockup: make detector be aware of task switch of processes hogging cpu") Signed-off-by: Petr Mladek Signed-off-by: Thomas Gleixner Acked-by: Peter Ziljstra Link: https://lore.kernel.org/r/20191024114928.15377-2-pmladek@suse.com --- kernel/watchdog.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0621301ae8cf..e3774e9625b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -173,7 +173,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -413,22 +412,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) { - /* - * When multiple processes are causing softlockups the - * softlockup detector only warns on the first one - * because the code relies on a full quiet cycle to - * re-arm. The second process prevents the quiet cycle - * and never gets reported. Use task pointers to detect - * this. - */ - if (__this_cpu_read(softlockup_task_ptr_saved) != - current) { - __this_cpu_write(soft_watchdog_warn, false); - __touch_watchdog(); - } + if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - } if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already @@ -444,7 +429,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); - __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); if (regs) -- cgit v1.2.3 From c052bf82c6b00ca27aab0859addc4b3159dfd3a4 Mon Sep 17 00:00:00 2001 From: Jonas Meurer Date: Thu, 16 Jan 2020 12:53:54 +0100 Subject: PM: suspend: Add sysfs attribute to control the "sync on suspend" behavior The sysfs attribute `/sys/power/sync_on_suspend` controls, whether or not filesystems are synced by the kernel before system suspend. Congruously, the behaviour of build-time switch CONFIG_SUSPEND_SKIP_SYNC is slightly changed: It now defines the run-tim default for the new sysfs attribute `/sys/power/sync_on_suspend`. The run-time attribute is added because the existing corresponding build-time Kconfig flag for (`CONFIG_SUSPEND_SKIP_SYNC`) is not flexible enough. E.g. Linux distributions that provide pre-compiled kernels usually want to stick with the default (sync filesystems before suspend) but under special conditions this needs to be changed. One example for such a special condition is user-space handling of suspending block devices (e.g. using `cryptsetup luksSuspend` or `dmsetup suspend`) before system suspend. The Kernel trying to sync filesystems after the underlying block device already got suspended obviously leads to dead-locks. Be aware that you have to take care of the filesystem sync yourself before suspending the system in those scenarios. Signed-off-by: Jonas Meurer Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 5 ++++- kernel/power/main.c | 33 +++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 2 +- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d3667b4075c1..7cbfbeacd68a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,7 +27,10 @@ config SUSPEND_SKIP_SYNC Skip the kernel sys_sync() before freezing user processes. Some systems prefer not to pay this cost on every invocation of suspend, or they are content with invoking sync() from - user-space before invoking suspend. Say Y if that's your case. + user-space before invoking suspend. There's a run-time switch + at '/sys/power/sync_on_suspend' to configure this behaviour. + This setting changes the default for the run-tim switch. Say Y + to change the default to disable the kernel sys_sync(). config HIBERNATE_CALLBACKS bool diff --git a/kernel/power/main.c b/kernel/power/main.c index e26de7af520b..69b7a8aeca3b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -190,6 +190,38 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr } power_attr(mem_sleep); + +/* + * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * + * show() returns whether ksys_sync_helper() is invoked before suspend. + * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + */ +bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); + +static ssize_t sync_on_suspend_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", sync_on_suspend_enabled); +} + +static ssize_t sync_on_suspend_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + sync_on_suspend_enabled = !!val; + return n; +} + +power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -855,6 +887,7 @@ static struct attribute * g[] = { &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, + &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f3b7239f1892..2c47280fbfc7 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -564,7 +564,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); - if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); -- cgit v1.2.3 From 18451f9f9e5810b8bd1245c5ae166f257e0e2b9d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 16 Jan 2020 12:09:34 +0100 Subject: PM: hibernate: fix crashes with init_on_free=1 Upon resuming from hibernation, free pages may contain stale data from the kernel that initiated the resume. This breaks the invariant inflicted by init_on_free=1 that freed pages must be zeroed. To deal with this problem, make clear_free_pages() also clear the free pages when init_on_free is enabled. Fixes: 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options") Reported-by: Johannes Stezenbach Signed-off-by: Alexander Potapenko Cc: 5.3+ # 5.3+ Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 26b9168321e7..d65f2d5ab694 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1147,24 +1147,24 @@ void free_basic_memory_bitmaps(void) void clear_free_pages(void) { -#ifdef CONFIG_PAGE_POISONING_ZERO struct memory_bitmap *bm = free_pages_map; unsigned long pfn; if (WARN_ON(!(free_pages_map))) return; - memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); - while (pfn != BM_END_OF_MAP) { - if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); - + if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); } - memory_bm_position_reset(bm); - pr_info("free pages cleared after restore\n"); -#endif /* PAGE_POISONING_ZERO */ } /** -- cgit v1.2.3 From 75ccae62cb8d42a619323a85c577107b8b37d797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 16 Jan 2020 16:14:44 +0100 Subject: xdp: Move devmap bulk queue into struct net_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 96360004b862 ("xdp: Make devmap flush_list common for all map instances"), changed devmap flushing to be a global operation instead of a per-map operation. However, the queue structure used for bulking was still allocated as part of the containing map. This patch moves the devmap bulk queue into struct net_device. The motivation for this is reusing it for the non-map variant of XDP_REDIRECT, which will be changed in a subsequent commit. To avoid other fields of struct net_device moving to different cache lines, we also move a couple of other members around. We defer the actual allocation of the bulk queue structure until the NETDEV_REGISTER notification devmap.c. This makes it possible to check for ndo_xdp_xmit support before allocating the structure, which is not possible at the time struct net_device is allocated. However, we keep the freeing in free_netdev() to avoid adding another RCU callback on NETDEV_UNREGISTER. Because of this change, we lose the reference back to the map that originated the redirect, so change the tracepoint to always return 0 as the map ID and index. Otherwise no functional change is intended with this patch. After this patch, the relevant part of struct net_device looks like this, according to pahole: /* --- cacheline 14 boundary (896 bytes) --- */ struct netdev_queue * _tx __attribute__((__aligned__(64))); /* 896 8 */ unsigned int num_tx_queues; /* 904 4 */ unsigned int real_num_tx_queues; /* 908 4 */ struct Qdisc * qdisc; /* 912 8 */ unsigned int tx_queue_len; /* 920 4 */ spinlock_t tx_global_lock; /* 924 4 */ struct xdp_dev_bulk_queue * xdp_bulkq; /* 928 8 */ struct xps_dev_maps * xps_cpus_map; /* 936 8 */ struct xps_dev_maps * xps_rxqs_map; /* 944 8 */ struct mini_Qdisc * miniq_egress; /* 952 8 */ /* --- cacheline 15 boundary (960 bytes) --- */ struct hlist_head qdisc_hash[16]; /* 960 128 */ /* --- cacheline 17 boundary (1088 bytes) --- */ struct timer_list watchdog_timer; /* 1088 40 */ /* XXX last struct has 4 bytes of padding */ int watchdog_timeo; /* 1128 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head todo_list; /* 1136 16 */ /* --- cacheline 18 boundary (1152 bytes) --- */ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768397.1458396.12673224324627072349.stgit@toke.dk --- kernel/bpf/devmap.c | 63 +++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index da9c832fc5c8..030d125c3839 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,13 +53,11 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 -struct bpf_dtab_netdev; - -struct xdp_bulk_queue { +struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; + struct net_device *dev; struct net_device *dev_rx; - struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -67,9 +65,8 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; - struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; - unsigned int idx; /* keep track of map index for tracepoint */ + unsigned int idx; }; struct bpf_dtab { @@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) +static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { - struct bpf_dtab_netdev *obj = bq->obj; - struct net_device *dev = obj->dev; + struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; @@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, - sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; @@ -374,7 +367,7 @@ error: void __dev_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq, *tmp; + struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) @@ -401,12 +394,12 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, +static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -444,7 +437,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf, dev_rx); + return bq_enqueue(dev, xdpf, dev_rx); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -483,7 +476,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -538,30 +530,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, u32 ifindex, unsigned int idx) { - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev; - struct xdp_bulk_queue *bq; - int cpu; - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { - free_percpu(dev->bulkq); kfree(dev); return ERR_PTR(-EINVAL); } @@ -721,9 +698,23 @@ static int dev_map_notification(struct notifier_block *notifier, { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; - int i; + int i, cpu; switch (event) { + case NETDEV_REGISTER: + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) + break; + + /* will be freed in free_netdev() */ + netdev->xdp_bulkq = + __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), + sizeof(void *), GFP_ATOMIC); + if (!netdev->xdp_bulkq) + return NOTIFY_BAD; + + for_each_possible_cpu(cpu) + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete -- cgit v1.2.3 From 1d233886dd904edbf239eeffe435c3308ae97625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 16 Jan 2020 16:14:45 +0100 Subject: xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the bulk queue used by XDP_REDIRECT now lives in struct net_device, we can re-use the bulking for the non-map version of the bpf_redirect() helper. This is a simple matter of having xdp_do_redirect_slow() queue the frame on the bulk queue instead of sending it out with __bpf_tx_xdp(). Unfortunately we can't make the bpf_redirect() helper return an error if the ifindex doesn't exit (as bpf_redirect_map() does), because we don't have a reference to the network namespace of the ingress device at the time the helper is called. So we have to leave it as-is and keep the device lookup in xdp_do_redirect_slow(). Since this leaves less reason to have the non-map redirect code in a separate function, so we get rid of the xdp_do_redirect_slow() function entirely. This does lose us the tracepoint disambiguation, but fortunately the xdp_redirect and xdp_redirect_map tracepoints use the same tracepoint entry structures. This means both can contain a map index, so we can just amend the tracepoint definitions so we always emit the xdp_redirect(_err) tracepoints, but with the map ID only populated if a map is present. This means we retire the xdp_redirect_map(_err) tracepoints entirely, but keep the definitions around in case someone is still listening for them. With this change, the performance of the xdp_redirect sample program goes from 5Mpps to 8.4Mpps (a 68% increase). Since the flush functions are no longer map-specific, rename the flush() functions to drop _map from their names. One of the renamed functions is the xdp_do_flush_map() callback used in all the xdp-enabled drivers. To keep from having to update all drivers, use a #define to keep the old name working, and only update the virtual drivers in this patch. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768505.1458396.17518057312953572912.stgit@toke.dk --- kernel/bpf/devmap.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 030d125c3839..d5311009953f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -81,7 +81,7 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); +static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -357,16 +357,16 @@ error: goto out; } -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(void) +void __dev_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -396,9 +396,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) */ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) - { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -419,10 +418,9 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct net_device *dev = dst->dev; struct xdp_frame *xdpf; int err; @@ -440,6 +438,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return __xdp_enqueue(dev, xdp, dev_rx); +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + + return __xdp_enqueue(dev, xdp, dev_rx); +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -762,7 +774,7 @@ static int __init dev_map_init(void) register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } -- cgit v1.2.3 From 58aa94f922c1b44e0919d1814d2eab5b9e8bf945 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 16 Jan 2020 16:14:46 +0100 Subject: devmap: Adjust tracepoint for map-less queue flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we don't have a reference to a devmap when flushing the device bulk queue, let's change the the devmap_xmit tracepoint to remote the map_id and map_index fields entirely. Rearrange the fields so 'drops' and 'sent' stay in the same position in the tracepoint struct, to make it possible for the xdp_monitor utility to read both the old and the new format. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/157918768613.1458396.9165902403373826572.stgit@toke.dk --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d5311009953f..de630f980282 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -340,7 +340,7 @@ static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; -- cgit v1.2.3 From 81f2b572cf4fd5c4178fe0e2b5bb1ab096385fd8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 16 Jan 2020 22:53:00 +0800 Subject: bpf: Remove set but not used variable 'first_key' kernel/bpf/syscall.c: In function generic_map_lookup_batch: kernel/bpf/syscall.c:1339:7: warning: variable first_key set but not used [-Wunused-but-set-variable] It is never used, so remove it. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Alexei Starovoitov Acked-by: Brian Vazquez Link: https://lore.kernel.org/bpf/20200116145300.59056-1-yuehaibing@huawei.com --- kernel/bpf/syscall.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d94d361379c..c26a71460f02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1336,7 +1336,6 @@ int generic_map_lookup_batch(struct bpf_map *map, void *buf, *buf_prevkey, *prev_key, *key, *value; int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; - bool first_key = false; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -1365,7 +1364,6 @@ int generic_map_lookup_batch(struct bpf_map *map, } err = -EFAULT; - first_key = false; prev_key = NULL; if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) goto free_buf; -- cgit v1.2.3 From 31537cf8f3f95d45360b995ad8be2c870edc5b02 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Jan 2020 08:57:55 -0500 Subject: tracing: Initialize ret in syscall_enter_define_fields() If syscall_enter_define_fields() is called on a system call with no arguments, the return code variable "ret" will never get initialized. Initialize it to zero. Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/0FA8C6E3-D9F5-416D-A1B0-5E4CD583A101@lca.pw --- kernel/trace/trace_syscalls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 73140d80dd46..2978c29d87d4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -274,7 +274,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; int offset = offsetof(typeof(trace), args); - int ret, i; + int ret = 0; + int i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], -- cgit v1.2.3 From db5793c5993d265fe6644b6638fcb0758f6b5347 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 18 Dec 2019 05:31:25 +0000 Subject: watchdog: Remove soft_lockup_hrtimer_cnt and related code After commit 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work"), the percpu soft_lockup_hrtimer_cnt is not used any more, so remove it and related code. Signed-off-by: Jisheng Zhang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191218131720.4146aea2@xhacker.debian --- kernel/watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f41334ef0971..0621301ae8cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -173,7 +173,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -350,8 +349,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); complete(this_cpu_ptr(&softlockup_completion)); -- cgit v1.2.3 From 5f68eb19b5716f8cf3ccfa833cffd1522813b0e8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 20 Dec 2019 12:04:53 +0100 Subject: sched/fair : Improve update_sd_pick_busiest for spare capacity case Similarly to calculate_imbalance() and find_busiest_group(), using the number of idle CPUs when there is only 1 CPU in the group is not efficient because we can't make a difference between a CPU running 1 task and a CPU running dozens of small tasks competing for the same CPU but not enough to overload it. More generally speaking, we should use the number of running tasks when there is the same number of idle CPUs in a group instead of blindly select the 1st one. When the groups have spare capacity and the same number of idle CPUs, we compare the number of running tasks to select the busiest group. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1576839893-26930-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d170b5da0e3..35c105759dfa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8181,14 +8181,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* - * Select not overloaded group with lowest number of - * idle cpus. We could also compare the spare capacity - * which is more stable but it can end up that the - * group has less spare capacity but finally more idle + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle * CPUs which means less opportunity to pull tasks. */ - if (sgs->idle_cpus >= busiest->idle_cpus) + if (sgs->idle_cpus > busiest->idle_cpus) return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + break; } -- cgit v1.2.3 From 323af6deaf70f204880caf94678350802682e0dc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Jan 2020 13:57:04 +0530 Subject: sched/fair: Load balance aggressively for SCHED_IDLE CPUs The fair scheduler performs periodic load balance on every CPU to check if it can pull some tasks from other busy CPUs. The duration of this periodic load balance is set to sd->balance_interval for the idle CPUs and is calculated by multiplying the sd->balance_interval with the sd->busy_factor (set to 32 by default) for the busy CPUs. The multiplication is done for busy CPUs to avoid doing load balance too often and rather spend more time executing actual task. While that is the right thing to do for the CPUs busy with SCHED_OTHER or SCHED_BATCH tasks, it may not be the optimal thing for CPUs running only SCHED_IDLE tasks. With the recent enhancements in the fair scheduler around SCHED_IDLE CPUs, we now prefer to enqueue a newly-woken task to a SCHED_IDLE CPU instead of other busy or idle CPUs. The same reasoning should be applied to the load balancer as well to make it migrate tasks more aggressively to a SCHED_IDLE CPU, as that will reduce the scheduling latency of the migrated (SCHED_OTHER) tasks. This patch makes minimal changes to the fair scheduler to do the next load balance soon after the last non SCHED_IDLE task is dequeued from a runqueue, i.e. making the CPU SCHED_IDLE. Also the sd->busy_factor is ignored while calculating the balance_interval for such CPUs. This is done to avoid delaying the periodic load balance by few hundred milliseconds for SCHED_IDLE CPUs. This is tested on ARM64 Hikey620 platform (octa-core) with the help of rt-app and it is verified, using kernel traces, that the newly SCHED_IDLE CPU does load balancing shortly after it becomes SCHED_IDLE and pulls tasks from other busy CPUs. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/e485827eb8fe7db0943d6f3f6e0f5a4a70272781.1578471925.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35c105759dfa..d292883694b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5210,6 +5210,18 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5324,6 +5336,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5370,6 +5383,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5392,15 +5409,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* CPU only has SCHED_IDLE tasks enqueued */ -static int sched_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && - rq->nr_running); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -9546,6 +9554,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9582,7 +9591,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9598,9 +9607,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); -- cgit v1.2.3 From 7226017ad37a888915628e59a84a2d1e57b40707 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 24 Dec 2019 11:54:04 +0000 Subject: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups When a new cgroup is created, the effective uclamp value wasn't updated with a call to cpu_util_update_eff() that looks at the hierarchy and update to the most restrictive values. Fix it by ensuring to call cpu_util_update_eff() when a new cgroup becomes online. Without this change, the newly created cgroup uses the default root_task_group uclamp values, which is 1024 for both uclamp_{min, max}, which will cause the rq to to be clamped to max, hence cause the system to run at max frequency. The problem was observed on Ubuntu server and was reproduced on Debian and Buildroot rootfs. By default, Ubuntu and Debian create a cpu controller cgroup hierarchy and add all tasks to it - which creates enough noise to keep the rq uclamp value at max most of the time. Imitating this behavior makes the problem visible in Buildroot too which otherwise looks fine since it's a minimal userspace. Fixes: 0b60ba2dd342 ("sched/uclamp: Propagate parent clamps") Reported-by: Doug Smythies Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Doug Smythies Link: https://lore.kernel.org/lkml/000701d5b965$361b6c60$a2524520$@net/ --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7b08d52db93..d0270b14c132 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7099,6 +7099,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) if (parent) sched_online_group(tg, parent); + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* Propagate the effective uclamp value for the new group */ + cpu_util_update_eff(css); +#endif + return 0; } -- cgit v1.2.3 From dcd6dffb0a75741471297724640733fa4e958d72 Mon Sep 17 00:00:00 2001 From: Li Guanglei Date: Wed, 25 Dec 2019 15:44:04 +0800 Subject: sched/core: Fix size of rq::uclamp initialization rq::uclamp is an array of struct uclamp_rq, make sure we clear the whole thing. Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcountinga") Signed-off-by: Li Guanglei Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Link: https://lkml.kernel.org/r/1577259844-12677-1-git-send-email-guangleix.li@gmail.com --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0270b14c132..fc1dfc007604 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1253,7 +1253,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + memset(&cpu_rq(cpu)->uclamp, 0, + sizeof(struct uclamp_rq)*UCLAMP_CNT); cpu_rq(cpu)->uclamp_flags = 0; } -- cgit v1.2.3 From 02d4ac5885a18d326b500b94808f0956dcce2832 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 26 Dec 2019 16:52:24 +0800 Subject: sched/debug: Reset watchdog on all CPUs while processing sysrq-t Lengthy output of sysrq-t may take a lot of time on slow serial console with lots of processes and CPUs. So we need to reset NMI-watchdog to avoid spurious lockup messages, and we also reset softlockup watchdogs on all other CPUs since another CPU might be blocked waiting for us to process an IPI or stop_machine. Add to sysrq_sched_debug_show() as what we did in show_state_filter(). Signed-off-by: Wei Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20191226085224.48942-1-liwei391@huawei.com --- kernel/sched/debug.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..879d3ccf3806 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void) int cpu; sched_debug_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + /* + * Need to reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI or stop_machine. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu); - + } } /* -- cgit v1.2.3 From 35f4cd96f5551dc1b2641159e7bb7bf91de6600f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 28 Dec 2019 16:19:12 +0000 Subject: stop_machine: Make stop_cpus() static The function stop_cpus() is only used internally by the stop_machine for stop multiple cpus. Make it static. Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191228161912.24082-1-tiny.windzz@gmail.com --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d68ec4c4015..865bb0228ab6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; -- cgit v1.2.3 From 9dec1b6949ae9509cdc3edb2d75fda39c9db9fa2 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 2 Jan 2020 18:07:52 +0800 Subject: sched/cputime: move rq parameter in irqtime_account_process_tick Every time we call irqtime_account_process_tick() is in a interrupt, Every caller will get and assign a parameter rq = this_rq(), This is unnecessary and increase the code size a little bit. Move the rq getting action to irqtime_account_process_tick internally is better. base with this patch cputime.o 578792 bytes 577888 bytes Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1577959674-255537-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/sched/cputime.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d43318a489f2..cff3e656566d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int ticks) + int ticks) { u64 other, cputime = TICK_NSEC * ticks; @@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); - } else if (p == rq->idle) { + } else if (p == this_rq()->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); @@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, static void irqtime_account_idle_ticks(int ticks) { - struct rq *rq = this_rq(); - - irqtime_account_process_tick(current, 0, rq, ticks); + irqtime_account_process_tick(current, 0, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) { } + int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) void account_process_tick(struct task_struct *p, int user_tick) { u64 cputime, steal; - struct rq *rq = this_rq(); if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq, 1); + irqtime_account_process_tick(p, user_tick, 1); return; } @@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); -- cgit v1.2.3 From fe71bbb21ee14160f73f81b113d71145327a1c0d Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 3 Jan 2020 19:44:00 +0800 Subject: sched/fair: calculate delta runnable load only when it's needed Move the code of calculation for delta_sum/delta_avg to where it is really needed to be done. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200103114400.17668-1-rocking@linux.alibaba.com --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d292883694b7..32c5421b6a25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) -- cgit v1.2.3 From 4c58f57fa6e93318a0899f70d8b99fe6bac22ce8 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 4 Jan 2020 21:08:28 +0800 Subject: sched/fair: Fix sgc->{min,max}_capacity calculation for SD_OVERLAP commit bf475ce0a3dd ("sched/fair: Add per-CPU min capacity to sched_group_capacity") introduced per-cpu min_capacity. commit e3d6d0cb66f2 ("sched/fair: Add sched_group per-CPU max capacity") introduced per-cpu max_capacity. In the SD_OVERLAP case, the local variable 'capacity' represents the sum of CPU capacity of all CPUs in the first sched group (sg) of the sched domain (sd). It is erroneously used to calculate sg's min and max CPU capacity. To fix this use capacity_of(cpu) instead of 'capacity'. The code which achieves this via cpu_rq(cpu)->sd->groups->sgc->capacity (for rq->sd != NULL) can be removed since it delivers the same value as capacity_of(cpu) which is currently only used for the (!rq->sd) case (see update_cpu_capacity()). An sg of the lowest sd (rq->sd or sd->child == NULL) represents a single CPU (and hence sg->sgc->capacity == capacity_of(cpu)). Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200104130828.GA7718@iZj6chx1xj0e0buvshuecpZ --- kernel/sched/fair.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 32c5421b6a25..e84723c5c661 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7802,29 +7802,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); + unsigned long cpu_cap = capacity_of(cpu); - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } - - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* -- cgit v1.2.3 From 3d817689a62cf71bbb290af18cd26cf9764f38fe Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 18 Dec 2019 20:38:18 +0800 Subject: sched/psi: create /proc/pressure and /proc/pressure/{io|memory|cpu} only when psi enabled when CONFIG_PSI_DEFAULT_DISABLED set to N or the command line set psi=0, I think we should not create /proc/pressure and /proc/pressure/{io|memory|cpu}. In the future, user maybe determine whether the psi feature is enabled by checking the existence of the /proc/pressure dir or /proc/pressure/{io|memory|cpu} files. Signed-off-by: Wang Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/1576672698-32504-1-git-send-email-w@laoqinren.net --- kernel/sched/psi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ce8f6748678a..db7b50bba3f1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = { static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + } return 0; } module_init(psi_proc_init); -- cgit v1.2.3 From a4f9a0e51bbf89cb461b1985a1a570e6b87da3b5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 15 Jan 2020 11:20:20 +0100 Subject: sched/fair: Remove redundant call to cpufreq_update_util() With commit bef69dd87828 ("sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()") update_load_avg() has become the central point for calling cpufreq (not including the update of blocked load). This change helps to simplify further the number of calls to cpufreq_update_util() and to remove last redundant ones. With update_load_avg(), we are now sure that cpufreq_update_util() will be called after every task attachment to a cfs_rq and especially after propagating this event down to the util_avg of the root cfs_rq, which is the level that is used by cpufreq governors like schedutil to set the frequency of a CPU. The SCHED_CPUFREQ_MIGRATION flag forces an early call to cpufreq when the migration happens in a cgroup whereas util_avg of root cfs_rq is not yet updated and this call is duplicated with the one that happens immediately after when the migration event reaches the root cfs_rq. The dedicated flag SCHED_CPUFREQ_MIGRATION is now useless and can be removed. The interface of attach_entity_load_avg() can also be simplified accordingly. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/1579083620-24943-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e84723c5c661..ebf50955fe8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3521,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3557,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3615,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } else if (decayed) { @@ -3872,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -10436,7 +10436,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } -- cgit v1.2.3 From 3e0de271fff77abb933f1b69c213854c3eda9125 Mon Sep 17 00:00:00 2001 From: Hewenliang Date: Thu, 9 Jan 2020 21:56:04 -0500 Subject: idle: fix spelling mistake "iterrupts" -> "interrupts" There is a spelling misake in comments of cpuidle_idle_call. Fix it. Signed-off-by: Hewenliang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200110025604.34373-1-hewenliang4@huawei.com --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ffa959e91227..b743bf38f08f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) /* * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only - * activity happens here and in iterrupts (if any). In that case bypass + * activity happens here and in interrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle -- cgit v1.2.3 From ccf74128d66ce937876184ad55db2e0276af08d3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Jan 2020 16:09:15 +0000 Subject: sched/topology: Assert non-NUMA topology masks don't (partially) overlap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit topology.c::get_group() relies on the assumption that non-NUMA domains do not partially overlap. Zeng Tao pointed out in [1] that such topology descriptions, while completely bogus, can end up being exposed to the scheduler. In his example (8 CPUs, 2-node system), we end up with: MC span for CPU3 == 3-7 MC span for CPU4 == 4-7 The first pass through get_group(3, sdd@MC) will result in the following sched_group list: 3 -> 4 -> 5 -> 6 -> 7 ^ / `----------------' And a later pass through get_group(4, sdd@MC) will "corrupt" that to: 3 -> 4 -> 5 -> 6 -> 7 ^ / `-----------' which will completely break things like 'while (sg != sd->groups)' when using CPU3's base sched_domain. There already are some architecture-specific checks in place such as x86/kernel/smpboot.c::topology.sane(), but this is something we can detect in the core scheduler, so it seems worthwhile to do so. Warn and abort the construction of the sched domains if such a broken topology description is detected. Note that this is somewhat expensive (O(t.c²), 't' non-NUMA topology levels and 'c' CPUs) and could be gated under SCHED_DEBUG if deemed necessary. Testing ======= Dietmar managed to reproduce this using the following qemu incantation: $ qemu-system-aarch64 -kernel ./Image -hda ./qemu-image-aarch64.img \ -append 'root=/dev/vda console=ttyAMA0 loglevel=8 sched_debug' -smp \ cores=8 --nographic -m 512 -cpu cortex-a53 -machine virt -numa \ node,cpus=0-2,nodeid=0 -numa node,cpus=3-7,nodeid=1 alongside the following drivers/base/arch_topology.c hack (AIUI wouldn't be needed if '-smp cores=X, sockets=Y' would work with qemu): 8<--- @@ -465,6 +465,9 @@ void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->package_id != cpu_topo->package_id) continue; + if ((cpu < 4 && cpuid > 3) || (cpu > 3 && cpuid < 4)) + continue; + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); 8<--- [1]: https://lkml.kernel.org/r/1577088979-8545-1-git-send-email-prime.zeng@hisilicon.com Reported-by: Zeng Tao Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200115160915.22575-1-valentin.schneider@arm.com --- kernel/sched/topology.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6ec1e595b1d4..dfb64c08a407 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1879,6 +1879,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for + * any two given CPUs at this (non-NUMA) topology level. + */ +static bool topology_span_sane(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, int cpu) +{ + int i; + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + return true; + + /* + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. + */ + for_each_cpu(i, cpu_map) { + if (i == cpu) + continue; + /* + * We should 'and' all those masks with 'cpu_map' to exactly + * match the topology we're about to build, but that can only + * remove CPUs, which only lessens our ability to detect + * overlaps + */ + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && + cpumask_intersects(tl->mask(cpu), tl->mask(i))) + return false; + } + + return true; +} + /* * Find the sched_domain_topology_level where all CPU capacities are visible * for all CPUs. @@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att has_asym = true; } + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) + goto error; + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); if (tl == sched_domain_topology) -- cgit v1.2.3 From 39e7234f00bc93613c086ae42d852d5f4147120a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 15 Jan 2020 10:43:36 -0500 Subject: locking/rwsem: Fix kernel crash when spinning on RWSEM_OWNER_UNKNOWN The commit 91d2a812dfb9 ("locking/rwsem: Make handoff writer optimistically spin on owner") will allow a recently woken up waiting writer to spin on the owner. Unfortunately, if the owner happens to be RWSEM_OWNER_UNKNOWN, the code will incorrectly spin on it leading to a kernel crash. This is fixed by passing the proper non-spinnable bits to rwsem_spin_on_owner() so that RWSEM_OWNER_UNKNOWN will be treated as a non-spinnable target. Fixes: 91d2a812dfb9 ("locking/rwsem: Make handoff writer optimistically spin on owner") Reported-by: Christoph Hellwig Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Tested-by: Christoph Hellwig Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200115154336.8679-1-longman@redhat.com --- kernel/locking/rwsem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44e68761f432..0d9b6be9ecc8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1226,8 +1226,8 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if ((wstate == WRITER_HANDOFF) && - (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + if (wstate == WRITER_HANDOFF && + rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ -- cgit v1.2.3 From a030f9767da1a6bbcec840fc54770eb11c2414b6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 11 Dec 2019 16:31:39 -0500 Subject: locking/lockdep: Fix lockdep_stats indentation problem It was found that two lines in the output of /proc/lockdep_stats have indentation problem: # cat /proc/lockdep_stats : in-process chains: 25057 stack-trace entries: 137827 [max: 524288] number of stack traces: 7973 number of stack hash chains: 6355 combined max dependencies: 1356414598 hardirq-safe locks: 57 hardirq-unsafe locks: 1286 : All the numbers displayed in /proc/lockdep_stats except the two stack trace numbers are formatted with a field with of 11. To properly align all the numbers, a field width of 11 is now added to the two stack trace numbers. Fixes: 8c779229d0f4 ("locking/lockdep: Report more stack trace statistics") Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Link: https://lkml.kernel.org/r/20191211213139.29934-1-longman@redhat.com --- kernel/locking/lockdep_proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dadb7b7fba37..9bb6d2497b04 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,9 +286,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - seq_printf(m, " number of stack traces: %llu\n", + seq_printf(m, " number of stack traces: %11llu\n", lockdep_stack_trace_count()); - seq_printf(m, " number of stack hash chains: %llu\n", + seq_printf(m, " number of stack hash chains: %11llu\n", lockdep_stack_hash_count()); #endif seq_printf(m, " combined max dependencies: %11u\n", -- cgit v1.2.3 From 57097124cbbd310cc2b5884189e22e60a3c20514 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 7 Jan 2020 12:49:14 -0500 Subject: locking/qspinlock: Fix inaccessible URL of MCS lock paper It turns out that the URL of the MCS lock paper listed in the source code is no longer accessible. I did got question about where the paper was. This patch updates the URL to BZ 206115 which contains a copy of the paper from https://www.cs.rochester.edu/u/scott/papers/1991_TOCS_synch.pdf Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200107174914.4187-1-longman@redhat.com --- kernel/locking/qspinlock.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2473f10c6956..b9515fcc9b29 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -31,14 +31,15 @@ /* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the - * MCS lock. The paper below provides a good description for this kind - * of lock. + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at * - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 * - * This queued spinlock implementation is based on the MCS lock, however to make - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing - * API, we must modify it somehow. + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. * * In particular; where the traditional MCS lock consists of a tail pointer * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to -- cgit v1.2.3 From f5bfdc8e3947a7ae489cf8ae9cfd6b3fb357b952 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 13 Jan 2020 10:07:35 -0500 Subject: locking/osq: Use optimized spinning loop for arm64 Arm64 has a more optimized spinning loop (atomic_cond_read_acquire) using wfe for spinlock that can boost performance of sibling threads by putting the current cpu to a wait state that is broken only when the monitored variable changes or an external event happens. OSQ has a more complicated spinning loop. Besides the lock value, it also checks for need_resched() and vcpu_is_preempted(). The check for need_resched() is not a problem as it is only set by the tick interrupt handler. That will be detected by the spinning cpu right after iret. The vcpu_is_preempted() check, however, is a problem as changes to the preempt state of of previous node will not affect the wait state. For ARM64, vcpu_is_preempted is not currently defined and so is a no-op. Will has indicated that he is planning to para-virtualize wfe instead of defining vcpu_is_preempted for PV support. So just add a comment in arch/arm64/include/asm/spinlock.h to indicate that vcpu_is_preempted() should not be defined as suggested. On a 2-socket 56-core 224-thread ARM64 system, a kernel mutex locking microbenchmark was run for 10s with and without the patch. The performance numbers before patch were: Running locktest with mutex [runtime = 10s, load = 1] Threads = 224, Min/Mean/Max = 316/123,143/2,121,269 Threads = 224, Total Rate = 2,757 kop/s; Percpu Rate = 12 kop/s After patch, the numbers were: Running locktest with mutex [runtime = 10s, load = 1] Threads = 224, Min/Mean/Max = 334/147,836/1,304,787 Threads = 224, Total Rate = 3,311 kop/s; Percpu Rate = 15 kop/s So there was about 20% performance improvement. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200113150735.21956-1-longman@redhat.com --- kernel/locking/osq_lock.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 6ef600aa0f47..1f7734949ac8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!READ_ONCE(node->locked)) { - /* - * If we need to reschedule bail... so we can block. - * Use vcpu_is_preempted() to avoid waiting for a preempted - * lock holder: - */ - if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) - goto unqueue; - - cpu_relax(); - } - return true; + /* + * Wait to acquire the lock or cancelation. Note that need_resched() + * will come with an IPI, which will wake smp_cond_load_relaxed() if it + * is implemented with a monitor-wait. vcpu_is_preempted() relies on + * polling, be careful. + */ + if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() || + vcpu_is_preempted(node_cpu(node->prev)))) + return true; -unqueue: + /* unqueue */ /* * Step - A -- stabilize @prev * -- cgit v1.2.3 From 11e31f608b499f044f24b20be73f1dcab3e43f8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jan 2020 19:17:02 +0100 Subject: watchdog/softlockup: Enforce that timestamp is valid on boot Robert reported that during boot the watchdog timestamp is set to 0 for one second which is the indicator for a watchdog reset. The reason for this is that the timestamp is in seconds and the time is taken from sched clock and divided by ~1e9. sched clock starts at 0 which means that for the first second during boot the watchdog timestamp is 0, i.e. reset. Use ULONG_MAX as the reset indicator value so the watchdog works correctly right from the start. ULONG_MAX would only conflict with a real timestamp if the system reaches an uptime of 136 years on 32bit and almost eternity on 64bit. Reported-by: Robert Richter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87o8v3uuzl.fsf@nanos.tec.linutronix.de --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e3774e9625b7..b6b1f54a7837 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void) #ifdef CONFIG_SOFTLOCKUP_DETECTOR +#define SOFTLOCKUP_RESET ULONG_MAX + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -272,7 +274,7 @@ notrace void touch_softlockup_watchdog_sched(void) * Preemption can be enabled. It doesn't matter which CPU's timestamp * gets zeroed here, so use the raw_ operation. */ - raw_cpu_write(watchdog_touch_ts, 0); + raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); } notrace void touch_softlockup_watchdog(void) @@ -296,14 +298,14 @@ void touch_all_softlockup_watchdogs(void) * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) - per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; wq_watchdog_touch(-1); } void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); - __this_cpu_write(watchdog_touch_ts, 0); + __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); } static int is_softlockup(unsigned long touch_ts) @@ -379,7 +381,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - if (touch_ts == 0) { + if (touch_ts == SOFTLOCKUP_RESET) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically -- cgit v1.2.3 From da9ec3d3dd0f1240a48920be063448a2242dbd90 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 6 Jan 2020 12:03:39 +0000 Subject: perf: Correctly handle failed perf_get_aux_event() Vince reports a worrying issue: | so I was tracking down some odd behavior in the perf_fuzzer which turns | out to be because perf_even_open() sometimes returns 0 (indicating a file | descriptor of 0) even though as far as I can tell stdin is still open. ... and further the cause: | error is triggered if aux_sample_size has non-zero value. | | seems to be this line in kernel/events/core.c: | | if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) | goto err_locked; | | (note, err is never set) This seems to be a thinko in commit: ab43762ef010967e ("perf: Allow normal events to output AUX data") ... and we should probably return -EINVAL here, as this should only happen when the new event is mis-configured or does not have a compatible aux_event group leader. Fixes: ab43762ef010967e ("perf: Allow normal events to output AUX data") Reported-by: Vince Weaver Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Alexander Shishkin Tested-by: Vince Weaver --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a1f8bde19b56..2173c23c25b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; goto err_locked; + } /* * Must be under the same ctx::mutex as perf_install_in_context(), -- cgit v1.2.3 From 9a6b55ac4a44060bcb782baf002859b2a2c63267 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jan 2020 19:52:38 +0100 Subject: lib/vdso: Make __arch_update_vdso_data() logic understandable The function name suggests that this is a boolean checking whether the architecture asks for an update of the VDSO data, but it works the other way round. To spare further confusion invert the logic. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200114185946.656652824@linutronix.de --- kernel/time/vsyscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 5ee0f7709410..f0aab6182824 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -84,7 +84,7 @@ void update_vsyscall(struct timekeeper *tk) struct vdso_timestamp *vdso_ts; u64 nsec; - if (__arch_update_vdso_data()) { + if (!__arch_update_vdso_data()) { /* * Some architectures might want to skip the update of the * data page. -- cgit v1.2.3 From 9f24c540f7f8eb3a981528da9a9a636a5bdf5987 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jan 2020 19:52:39 +0100 Subject: lib/vdso: Update coarse timekeeper unconditionally The low resolution parts of the VDSO, i.e.: clock_gettime(CLOCK_*_COARSE), clock_getres(), time() can be used even if there is no VDSO capable clocksource. But if an architecture opts out of the VDSO data update then this information becomes stale. This affects ARM when there is no architected timer available. The lack of update causes userspace to use stale data forever. Make the update of the low resolution parts unconditional and only skip the update of the high resolution parts if the architecture requests it. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200114185946.765577901@linutronix.de --- kernel/time/vsyscall.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index f0aab6182824..9577c89179cd 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; - /* CLOCK_REALTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; - vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; @@ -70,12 +65,6 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - - /* - * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. - */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); } void update_vsyscall(struct timekeeper *tk) @@ -84,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk) struct vdso_timestamp *vdso_ts; u64 nsec; - if (!__arch_update_vdso_data()) { - /* - * Some architectures might want to skip the update of the - * data page. - */ - return; - } - /* copy vsyscall data */ vdso_write_begin(vdata); vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + /* CLOCK_REALTIME also required for time() */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; @@ -110,7 +96,18 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - update_vdso_data(vdata, tk); + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + + /* + * Architectures can opt out of updating the high resolution part + * of the VDSO. + */ + if (__arch_update_vdso_data()) + update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); -- cgit v1.2.3 From 6b3ad6649a4c75504edeba242d3fd36b3096a57f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 15 Jan 2020 14:42:34 +0100 Subject: ptrace: reintroduce usage of subjective credentials in ptrace_has_cap() Commit 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") introduced the ability to opt out of audit messages for accesses to various proc files since they are not violations of policy. While doing so it somehow switched the check from ns_capable() to has_ns_capability{_noaudit}(). That means it switched from checking the subjective credentials of the task to using the objective credentials. This is wrong since. ptrace_has_cap() is currently only used in ptrace_may_access() And is used to check whether the calling task (subject) has the CAP_SYS_PTRACE capability in the provided user namespace to operate on the target task (object). According to the cred.h comments this would mean the subjective credentials of the calling task need to be used. This switches ptrace_has_cap() to use security_capable(). Because we only call ptrace_has_cap() in ptrace_may_access() and in there we already have a stable reference to the calling task's creds under rcu_read_lock() there's no need to go through another series of dereferences and rcu locking done in ns_capable{_noaudit}(). As one example where this might be particularly problematic, Jann pointed out that in combination with the upcoming IORING_OP_OPENAT feature, this bug might allow unprivileged users to bypass the capability checks while asynchronously opening files like /proc/*/mem, because the capability checks for this would be performed against kernel credentials. To illustrate on the former point about this being exploitable: When io_uring creates a new context it records the subjective credentials of the caller. Later on, when it starts to do work it creates a kernel thread and registers a callback. The callback runs with kernel creds for ktask->real_cred and ktask->cred. To prevent this from becoming a full-blown 0-day io_uring will call override_cred() and override ktask->cred with the subjective credentials of the creator of the io_uring instance. With ptrace_has_cap() currently looking at ktask->real_cred this override will be ineffective and the caller will be able to open arbitray proc files as mentioned above. Luckily, this is currently not exploitable but will turn into a 0-day once IORING_OP_OPENAT{2} land in v5.6. Fix it now! Cc: Oleg Nesterov Cc: Eric Paris Cc: stable@vger.kernel.org Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Jann Horn Fixes: 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") Signed-off-by: Christian Brauner --- kernel/ptrace.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cb9ddcc08119..43d6179508d6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, + unsigned int mode) { + int ret; + if (mode & PTRACE_MODE_NOAUDIT) - return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); else - return has_ns_capability(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); + + return ret == 0; } /* Returns 0 on success, -errno on denial. */ @@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user_ns, mode)) + if (ptrace_has_cap(cred, tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -340,7 +345,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) + !ptrace_has_cap(cred, mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); -- cgit v1.2.3 From 87c9366e17259040a9118e06b6dc8de986e5d3d1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Dec 2019 17:43:46 +0100 Subject: Revert "um: Enable CONFIG_CONSTRUCTORS" This reverts commit 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS"). There are two issues with this commit, uncovered by Anton in tests on some (Debian) systems: 1) I completely forgot to call any constructors if CONFIG_CONSTRUCTORS isn't set. Don't recall now if it just wasn't needed on my system, or if I never tested this case. 2) With that fixed, it works - with CONFIG_CONSTRUCTORS *unset*. If I set CONFIG_CONSTRUCTORS, it fails again, which isn't totally unexpected since whatever wanted to run is likely to have to run before the kernel init etc. that calls the constructors in this case. Basically, some constructors that gcc emits (libc has?) need to run very early during init; the failure mode otherwise was that the ptrace fork test already failed: ---------------------- $ ./linux mem=512M Core dump limits : soft - 0 hard - NONE Checking that ptrace can change system call numbers...check_ptrace : child exited with exitcode 6, while expecting 0; status 0x67f Aborted ---------------------- Thinking more about this, it's clear that we simply cannot support CONFIG_CONSTRUCTORS in UML. All the cases we need now (gcov, kasan) involve not use of the __attribute__((constructor)), but instead some constructor code/entry generated by gcc. Therefore, we cannot distinguish between kernel constructors and system constructors. Thus, revert this commit. Cc: stable@vger.kernel.org [5.4+] Fixes: 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS") Reported-by: Anton Ivanov Signed-off-by: Johannes Berg Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 060e8e726755..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From afa70d941f663c69c9a64ec1021bbcfa82f0e54a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 20 Jan 2020 11:29:05 +0530 Subject: sched/fair: Define sched_idle_cpu() only for SMP configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sched_idle_cpu() isn't used for non SMP configuration and with a recent change, we have started getting following warning: kernel/sched/fair.c:5221:12: warning: ‘sched_idle_cpu’ defined but not used [-Wunused-function] Fix that by defining sched_idle_cpu() only for SMP configurations. Fixes: 323af6deaf70 ("sched/fair: Load balance aggressively for SCHED_IDLE CPUs") Reported-by: Stephen Rothwell Signed-off-by: Viresh Kumar Signed-off-by: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/f0554f590687478b33914a4aff9f0e6a62886d44.1579499907.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebf50955fe8a..fe4e0d775375 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5218,10 +5218,12 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } +#endif /* * The enqueue_task method is called before nr_running is -- cgit v1.2.3 From b01ecceaf2c0c4b3f2d24aa0adcf096ab1648253 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 10 Dec 2019 16:41:09 +0530 Subject: genirq: Introduce irq_domain_translate_onecell Add a new function irq_domain_translate_onecell() that is to be used as the translate function in struct irq_domain_ops. Signed-off-by: Yash Shah Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1575976274-13487-2-git-send-email-yash.shah@sifive.com --- kernel/irq/irqdomain.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dd822fd8a7d5..7a8808c8dfec 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -986,6 +986,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +/** + * irq_domain_translate_onecell() - Generic translate for direct one cell + * bindings + */ +int irq_domain_translate_onecell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 1)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = IRQ_TYPE_NONE; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_onecell); + /** * irq_domain_translate_twocell() - Generic translate for direct two cell * bindings -- cgit v1.2.3 From 708e0ada1916be765b7faa58854062f2bc620bbf Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 17 Jan 2020 13:32:21 +0100 Subject: module: avoid setting info->name early in case we can fall back to info->mod->name In setup_load_info(), info->name (which contains the name of the module, mostly used for early logging purposes before the module gets set up) gets unconditionally assigned if .modinfo is missing despite the fact that there is an if (!info->name) check near the end of the function. Avoid assigning a placeholder string to info->name if .modinfo doesn't exist, so that we can fall back to info->mod->name later on. Fixes: 5fdc7db6448a ("module: setup load info before module_sig_check()") Reviewed-by: Miroslav Benes Signed-off-by: Jessica Yu --- kernel/module.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2b5f9c4748bc..f2379e54fae8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3059,9 +3059,7 @@ static int setup_load_info(struct load_info *info, int flags) /* Try to find a name early so we can log errors with a module name */ info->index.info = find_sec(info, ".modinfo"); - if (!info->index.info) - info->name = "(missing .modinfo section)"; - else + if (info->index.info) info->name = get_modinfo(info, "name"); /* Find internal symbols and strings. */ @@ -3076,14 +3074,15 @@ static int setup_load_info(struct load_info *info, int flags) } if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", info->name); + pr_warn("%s: module has no symbols (stripped?)\n", + info->name ?: "(missing .modinfo section or name field)"); return -ENOEXEC; } info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { pr_warn("%s: No module found in object\n", - info->name ?: "(missing .modinfo name field)"); + info->name ?: "(missing .modinfo section or name field)"); return -ENOEXEC; } /* This is temporary: point mod into copy of data. */ -- cgit v1.2.3 From 0f394daef89b38d58c91118a2b08b8a1b316703b Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Mon, 20 Jan 2020 12:35:47 +0800 Subject: irqdomain: Fix a memory leak in irq_domain_push_irq() Fix a memory leak reported by kmemleak: unreferenced object 0xffff000bc6f50e80 (size 128): comm "kworker/23:2", pid 201, jiffies 4294894947 (age 942.132s) hex dump (first 32 bytes): 00 00 00 00 41 00 00 00 86 c0 03 00 00 00 00 00 ....A........... 00 a0 b2 c6 0b 00 ff ff 40 51 fd 10 00 80 ff ff ........@Q...... backtrace: [<00000000e62d2240>] kmem_cache_alloc_trace+0x1a4/0x320 [<00000000279143c9>] irq_domain_push_irq+0x7c/0x188 [<00000000d9f4c154>] thunderx_gpio_probe+0x3ac/0x438 [<00000000fd09ec22>] pci_device_probe+0xe4/0x198 [<00000000d43eca75>] really_probe+0xdc/0x320 [<00000000d3ebab09>] driver_probe_device+0x5c/0xf0 [<000000005b3ecaa0>] __device_attach_driver+0x88/0xc0 [<000000004e5915f5>] bus_for_each_drv+0x7c/0xc8 [<0000000079d4db41>] __device_attach+0xe4/0x140 [<00000000883bbda9>] device_initial_probe+0x18/0x20 [<000000003be59ef6>] bus_probe_device+0x98/0xa0 [<0000000039b03d3f>] deferred_probe_work_func+0x74/0xa8 [<00000000870934ce>] process_one_work+0x1c8/0x470 [<00000000e3cce570>] worker_thread+0x1f8/0x428 [<000000005d64975e>] kthread+0xfc/0x128 [<00000000f0eaa764>] ret_from_fork+0x10/0x18 Fixes: 495c38d3001f ("irqdomain: Add irq_domain_{push,pop}_irq() functions") Signed-off-by: Kevin Hao Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200120043547.22271-1-haokexin@gmail.com --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7a8808c8dfec..7527e5ef6fe5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1476,6 +1476,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (rv) { /* Restore the original irq_data. */ *root_irq_data = *child_irq_data; + kfree(child_irq_data); goto error; } -- cgit v1.2.3 From 8bcebc77e85f3d7536f96845a0fe94b1dddb6af0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 20 Jan 2020 13:07:31 -0500 Subject: tracing: Fix histogram code when expression has same var as value While working on a tool to convert SQL syntex into the histogram language of the kernel, I discovered the following bug: # echo 'first u64 start_time u64 end_time pid_t pid u64 delta' >> synthetic_events # echo 'hist:keys=pid:start=common_timestamp' > events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:delta=common_timestamp-$start,start2=$start:onmatch(sched.sched_waking).trace(first,$start2,common_timestamp,next_pid,$delta)' > events/sched/sched_switch/trigger Would not display any histograms in the sched_switch histogram side. But if I were to swap the location of "delta=common_timestamp-$start" with "start2=$start" Such that the last line had: # echo 'hist:keys=next_pid:start2=$start,delta=common_timestamp-$start:onmatch(sched.sched_waking).trace(first,$start2,common_timestamp,next_pid,$delta)' > events/sched/sched_switch/trigger The histogram works as expected. What I found out is that the expressions clear out the value once it is resolved. As the variables are resolved in the order listed, when processing: delta=common_timestamp-$start The $start is cleared. When it gets to "start2=$start", it errors out with "unresolved symbol" (which is silent as this happens at the location of the trace), and the histogram is dropped. When processing the histogram for variable references, instead of adding a new reference for a variable used twice, use the same reference. That way, not only is it more efficient, but the order will no longer matter in processing of the variables. From Tom Zanussi: "Just to clarify some more about what the problem was is that without your patch, we would have two separate references to the same variable, and during resolve_var_refs(), they'd both want to be resolved separately, so in this case, since the first reference to start wasn't part of an expression, it wouldn't get the read-once flag set, so would be read normally, and then the second reference would do the read-once read and also be read but using read-once. So everything worked and you didn't see a problem: from: start2=$start,delta=common_timestamp-$start In the second case, when you switched them around, the first reference would be resolved by doing the read-once, and following that the second reference would try to resolve and see that the variable had already been read, so failed as unset, which caused it to short-circuit out and not do the trigger action to generate the synthetic event: to: delta=common_timestamp-$start,start2=$start With your patch, we only have the single resolution which happens correctly the one time it's resolved, so this can't happen." Link: https://lore.kernel.org/r/20200116154216.58ca08eb@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reviewed-by: Tom Zanuss Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d33b046f985a..6ac35b9e195d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -116,6 +116,7 @@ struct hist_field { struct ftrace_event_field *field; unsigned long flags; hist_field_fn_t fn; + unsigned int ref; unsigned int size; unsigned int offset; unsigned int is_signed; @@ -2427,8 +2428,16 @@ static int contains_operator(char *str) return field_op; } +static void get_hist_field(struct hist_field *hist_field) +{ + hist_field->ref++; +} + static void __destroy_hist_field(struct hist_field *hist_field) { + if (--hist_field->ref > 1) + return; + kfree(hist_field->var.name); kfree(hist_field->name); kfree(hist_field->type); @@ -2470,6 +2479,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field) return NULL; + hist_field->ref = 1; + hist_field->hist_data = hist_data; if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) @@ -2665,6 +2676,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, { unsigned long flags = HIST_FIELD_FL_VAR_REF; struct hist_field *ref_field; + int i; + + /* Check if the variable already exists */ + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) { + get_hist_field(ref_field); + return ref_field; + } + } ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { -- cgit v1.2.3 From bf24daac8f2bd5b8affaec03c2be1d20bcdd6837 Mon Sep 17 00:00:00 2001 From: Masami Ichikawa Date: Thu, 16 Jan 2020 22:12:36 +0900 Subject: tracing: Do not set trace clock if tracefs lockdown is in effect When trace_clock option is not set and unstable clcok detected, tracing_set_default_clock() sets trace_clock(ThinkPad A285 is one of case). In that case, if lockdown is in effect, null pointer dereference error happens in ring_buffer_set_clock(). Link: http://lkml.kernel.org/r/20200116131236.3866925-1-masami256@gmail.com Cc: stable@vger.kernel.org Fixes: 17911ff38aa58 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1788488 Signed-off-by: Masami Ichikawa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ddb7e7f5fe8d..5b6ee4aadc26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9420,6 +9420,11 @@ __init static int tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not set tracing clock due to lockdown\n"); + return -EPERM; + } + printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" -- cgit v1.2.3 From 2e3a94aa2bfc6de95a0700f0a868c6f5db3a9592 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Sun, 19 Jan 2020 11:40:40 -0800 Subject: bpf: Fix memory leaks in generic update/delete batch ops Generic update/delete batch ops functions were using __bpf_copy_key without properly freeing the memory. Handle the memory allocation and copy_from_user separately. Fixes: aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Reported-by: Dan Carpenter Signed-off-by: Brian Vazquez Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200119194040.128369-1-brianvv@google.com --- kernel/bpf/syscall.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c26a71460f02..9a840c57f6df 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,12 +1239,15 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + for (cp = 0; cp < max_count; cp++) { - key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); + err = -EFAULT; + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size)) break; - } if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); @@ -1264,6 +1267,8 @@ int generic_map_delete_batch(struct bpf_map *map, } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; + + kfree(key); return err; } @@ -1294,18 +1299,21 @@ int generic_map_update_batch(struct bpf_map *map, if (!max_count) return 0; + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); - if (!value) + if (!value) { + kfree(key); return -ENOMEM; + } for (cp = 0; cp < max_count; cp++) { - key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - break; - } err = -EFAULT; - if (copy_from_user(value, values + cp * value_size, value_size)) + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size) || + copy_from_user(value, values + cp * value_size, value_size)) break; err = bpf_map_update_value(map, f, key, value, -- cgit v1.2.3 From 9a09cd74e7dc6c2ac34b39ea6e74440ceb4c501e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 13:50:07 +0800 Subject: ftrace: Remove abandoned macros These 2 macros aren't used from commit eee8ded131f1 ("ftrace: Have the function probes call their own function"), so remove them. Link: http://lkml.kernel.org/r/1579585807-43316-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f0ae07e72ef..7fe87c7ab1a8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_BITS 7 -#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -- cgit v1.2.3 From aff4866db56e5cc5601dcd896e056160e07ca361 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 13:54:23 +0800 Subject: ftrace: Remove NR_TO_INIT macro This macro isn't used from commit cb7be3b2fc2c ("ftrace: remove daemon"). So no needs to keep it. Link: http://lkml.kernel.org/r/1579586063-44984-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7fe87c7ab1a8..5c701765da5b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1101,9 +1101,6 @@ struct ftrace_page { #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -- cgit v1.2.3 From b87121dd3fa0a4d2636c0ad02a3b3fc5161fa26b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jan 2020 23:28:58 +0000 Subject: bpf: don't bother with getname/kern_path - use user_path_at kernel/bpf/inode.c misuses kern_path...() - it's much simpler (and more efficient, on top of that) to use user_path...() counterparts rather than bothering with doing getname() manually. Signed-off-by: Al Viro Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200120232858.GF8904@ZenIV.linux.org.uk --- kernel/bpf/inode.c | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index ecf42bec38c0..e11059b99f18 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -380,7 +380,7 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; -static int bpf_obj_do_pin(const struct filename *pathname, void *raw, +static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -389,7 +389,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, umode_t mode; int ret; - dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + dentry = user_path_create(AT_FDCWD, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -422,30 +422,22 @@ out: int bpf_obj_pin_user(u32 ufd, const char __user *pathname) { - struct filename *pname; enum bpf_type type; void *raw; int ret; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - raw = bpf_fd_probe_obj(ufd, &type); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + if (IS_ERR(raw)) + return PTR_ERR(raw); - ret = bpf_obj_do_pin(pname, raw, type); + ret = bpf_obj_do_pin(pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); -out: - putname(pname); + return ret; } -static void *bpf_obj_do_get(const struct filename *pathname, +static void *bpf_obj_do_get(const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -453,7 +445,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, void *raw; int ret; - ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -480,36 +472,27 @@ out: int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; - struct filename *pname; - int ret = -ENOENT; int f_flags; void *raw; + int ret; f_flags = bpf_get_file_flag(flags); if (f_flags < 0) return f_flags; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - - raw = bpf_obj_do_get(pname, &type, f_flags); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + raw = bpf_obj_do_get(pathname, &type, f_flags); + if (IS_ERR(raw)) + return PTR_ERR(raw); if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else - goto out; + return -ENOENT; if (ret < 0) bpf_any_put(raw, type); -out: - putname(pname); return ret; } -- cgit v1.2.3 From b83479482ff6f856b1308a17768f228be779543a Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 13:54:40 +0800 Subject: ring-buffer: Remove abandoned macro RB_MISSED_FLAGS This macro isn't used since commit d325c402964e ("ring-buffer: Remove unused function ring_buffer_page_len()"), so better to remove it. Link: http://lkml.kernel.org/r/1579586080-45300-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3bab9b0a90b6..61f0e92ace99 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -300,8 +300,6 @@ u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) -#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) - struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ -- cgit v1.2.3 From 141597204ea2dc173668c8cd7202c4bac2b0c476 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 13:54:46 +0800 Subject: tracing: Remove unused TRACE_SEQ_BUF_USED This macro isn't used from commit 3a161d99c43c ("tracing: Create seq_buf layer in trace_seq"). so no needs to keep it. Link: http://lkml.kernel.org/r/1579586086-45543-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_seq.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 87de6edafd14..1d84fcc78e3e 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -30,9 +30,6 @@ /* How much buffer is left on the trace_seq? */ #define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) -/* How much buffer is written? */ -#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) - /* * trace_seq should work with being initialized with 0s. */ -- cgit v1.2.3 From 532f49a6f19a153e202b5a174f8556fd50c36dd4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Jan 2020 08:30:07 +0300 Subject: tracing/boot: Fix an IS_ERR() vs NULL bug The trace_array_get_by_name() function doesn't return error pointers, it returns NULL on error. Link: http://lkml.kernel.org/r/20200117053007.5h2juv272pokqhtq@kili.mountain Fixes: 4f712a4d04a4 ("tracing/boot: Add instance node support") Acked-by: Masami Hiramatsu Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index fa9603dc6469..cd541ac1cbc1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -322,7 +322,7 @@ trace_boot_init_instances(struct xbc_node *node) continue; tr = trace_array_get_by_name(p); - if (IS_ERR(tr)) { + if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; } -- cgit v1.2.3 From 05d57f1793fb250c85028c9952c3720010baa853 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jan 2020 19:22:31 -0800 Subject: bpf: Fix trampoline usage in preempt Though the second half of trampoline page is unused a task could be preempted in the middle of the first half of trampoline and two updates to trampoline would change the code from underneath the preempted task. Hence wait for tasks to voluntarily schedule or go to userspace. Add similar wait before freeing the trampoline. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200121032231.3292185-1-ast@kernel.org --- kernel/bpf/trampoline.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 79a04417050d..7657ede7aee2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,6 +160,14 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + /* Though the second half of trampoline page is unused a task could be + * preempted in the middle of the first half of trampoline and two + * updates to trampoline would change the code from underneath the + * preempted task. Hence wait for tasks to voluntarily schedule or go + * to userspace. + */ + synchronize_rcu_tasks(); + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, fentry, fentry_cnt, @@ -251,6 +259,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + /* wait for tasks to get out of trampoline before freeing it */ + synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); -- cgit v1.2.3 From f59bbfc2f6099e8655f9e8f585e10ffde17176d0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 Jan 2020 18:41:38 -0800 Subject: bpf: Fix error path under memory pressure Restore the 'if (env->cur_state)' check that was incorrectly removed during code move. Under memory pressure env->cur_state can be freed and zeroed inside do_check(). Hence the check is necessary. Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification") Reported-by: syzbot+b296579ba5015704d9fa@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200122024138.3385590-1-ast@kernel.org --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca17dccc17ba..6defbec9eb62 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9594,8 +9594,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) ret = do_check(env); out: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + /* check for NULL is necessary, since cur_state can be freed inside + * do_check() under memory pressure. + */ + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret) -- cgit v1.2.3 From 34423f250a372d71346922edf2b84a19d811a311 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 22 Jan 2020 06:44:50 -0500 Subject: tracing: Fix uninitialized buffer var on early exit to trace_vbprintk() If we exit due to a bad input to trace_printk() (highly unlikely), then the buffer variable will not be initialized when we unnest the ring buffer. Reported-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e1db19dce97..d1410b4462ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3230,7 +3230,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) - goto out; + goto out_put; local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; @@ -3252,6 +3252,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) out: ring_buffer_nest_end(buffer); +out_put: put_trace_buf(); out_nobuffer: -- cgit v1.2.3 From 659ded30272d67a04b3692f0bfa12263be20d790 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 13:54:35 +0800 Subject: trace/kprobe: Remove unused MAX_KPROBE_CMDLINE_SIZE This limitation are never lunched from introduce commit 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Could we remove it if no intention to implement it? Link: http://lkml.kernel.org/r/1579586075-45132-1-git-send-email-alex.shi@linux.alibaba.com Acked-by: Masami Hiramatsu Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 283b7c437440..bf20cd7f2666 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -22,7 +22,6 @@ #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 -#define MAX_KPROBE_CMDLINE_SIZE 1024 /* Kprobe early definition from command line */ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; -- cgit v1.2.3 From b61387cb732cf283d318b2165c44913525fe545f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 Jan 2020 12:23:25 +0900 Subject: tracing/uprobe: Fix to make trace_uprobe_filter alignment safe Commit 99c9a923e97a ("tracing/uprobe: Fix double perf_event linking on multiprobe uprobe") moved trace_uprobe_filter on trace_probe_event. However, since it introduced a flexible data structure with char array and type casting, the alignment of trace_uprobe_filter can be broken. This changes the type of the array to trace_uprobe_filter data strucure to fix it. Link: http://lore.kernel.org/r/20200120124022.GA14897@hirez.programming.kicks-ass.net Link: http://lkml.kernel.org/r/157966340499.5107.10978352478952144902.stgit@devnote2 Fixes: 99c9a923e97a ("tracing/uprobe: Fix double perf_event linking on multiprobe uprobe") Suggested-by: Peter Zijlstra Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 9 ++++++--- kernel/trace/trace_probe.h | 10 ++++++++-- kernel/trace/trace_uprobe.c | 29 +++++++---------------------- 4 files changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3e5f9c7d939c..3f54dc2f6e1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group, 0); + ret = trace_probe_init(&tk->tp, event, group, false); if (ret < 0) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bba18cf44a30..9ae87be422f2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -984,16 +984,19 @@ void trace_probe_cleanup(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, size_t event_data_size) + const char *group, bool alloc_filter) { struct trace_event_call *call; + size_t size = sizeof(struct trace_probe_event); int ret = 0; if (!event || !group) return -EINVAL; - tp->event = kzalloc(sizeof(struct trace_probe_event) + event_data_size, - GFP_KERNEL); + if (alloc_filter) + size += sizeof(struct trace_uprobe_filter); + + tp->event = kzalloc(size, GFP_KERNEL); if (!tp->event) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 03e4e180058d..a0ff9e200ef6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -223,6 +223,12 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* Event call and class holder */ struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ @@ -230,7 +236,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; - char data[0]; + struct trace_uprobe_filter filter[0]; }; struct trace_probe { @@ -323,7 +329,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, size_t event_data_size); + const char *group, bool alloc_filter); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f66e202fec13..2619bc5ed520 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -34,12 +34,6 @@ struct uprobe_trace_entry_head { #define DATAOF_TRACE_ENTRY(entry, is_return) \ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) -struct trace_uprobe_filter { - rwlock_t rwlock; - int nr_systemwide; - struct list_head perf_events; -}; - static int trace_uprobe_create(int argc, const char **argv); static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); @@ -263,14 +257,6 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } NOKPROBE_SYMBOL(process_fetch_insn) -static struct trace_uprobe_filter * -trace_uprobe_get_filter(struct trace_uprobe *tu) -{ - struct trace_probe_event *event = tu->tp.event; - - return (struct trace_uprobe_filter *)&event->data[0]; -} - static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); @@ -358,8 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group, - sizeof(struct trace_uprobe_filter)); + ret = trace_probe_init(&tu->tp, event, group, true); if (ret < 0) goto error; @@ -367,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; - init_trace_uprobe_filter(trace_uprobe_get_filter(tu)); + init_trace_uprobe_filter(tu->tp.event->filter); return tu; error: @@ -1076,7 +1061,7 @@ static void __probe_event_disable(struct trace_probe *tp) struct trace_uprobe *tu; tu = container_of(tp, struct trace_uprobe, tp); - WARN_ON(!uprobe_filter_is_empty(trace_uprobe_get_filter(tu))); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); @@ -1117,7 +1102,7 @@ static int probe_event_enable(struct trace_event_call *call, } tu = container_of(tp, struct trace_uprobe, tp); - WARN_ON(!uprobe_filter_is_empty(trace_uprobe_get_filter(tu))); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); if (enabled) return 0; @@ -1281,7 +1266,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return -ENODEV; tu = container_of(tp, struct trace_uprobe, tp); - if (trace_uprobe_filter_remove(trace_uprobe_get_filter(tu), event)) + if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { @@ -1306,7 +1291,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return -ENODEV; tu = container_of(tp, struct trace_uprobe, tp); - if (trace_uprobe_filter_add(trace_uprobe_get_filter(tu), event)) + if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { @@ -1328,7 +1313,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, int ret; tu = container_of(uc, struct trace_uprobe, consumer); - filter = trace_uprobe_get_filter(tu); + filter = tu->tp.event->filter; read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); -- cgit v1.2.3 From eb5a4d0a9ee976008d1add75e3d64545399e80a3 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 20 Jan 2020 22:43:47 +0000 Subject: hrtimer: Add missing sparse annotation for __run_timer() Sparse reports a warning at __run_hrtimer() |warning: context imbalance in __run_hrtimer() - unexpected unlock Add the missing must_hold() annotation. Signed-off-by: Jules Irenge Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200120224347.51843-1-jbi.octave@gmail.com --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d8b62f93fc8d..3a609e7344f3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, - unsigned long flags) + unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; -- cgit v1.2.3 From 11ea68f553e244851d15793a7fa33a97c46d8271 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 20 Jan 2020 17:16:25 +0800 Subject: genirq, sched/isolation: Isolate from handling managed interrupts The affinity of managed interrupts is completely handled in the kernel and cannot be changed via the /proc/irq/* interfaces from user space. As the kernel tries to spread out interrupts evenly accross CPUs on x86 to prevent vector exhaustion, it can happen that a managed interrupt whose affinity mask contains both isolated and housekeeping CPUs is routed to an isolated CPU. As a consequence IO submitted on a housekeeping CPU causes interrupts on the isolated CPU. Add a new sub-parameter 'managed_irq' for 'isolcpus' and the corresponding logic in the interrupt affinity selection code. The subparameter indicates to the interrupt affinity selection logic that it should try to avoid the above scenario. This isolation is best effort and only effective if the automatically assigned interrupt mask of a device queue contains isolated and housekeeping CPUs. If housekeeping CPUs are online then such interrupts are directed to the housekeeping CPU so that IO submitted on the housekeeping CPU cannot disturb the isolated CPU. If a queue's affinity mask contains only isolated CPUs then this parameter has no effect on the interrupt routing decision, though interrupts are only happening when tasks running on those isolated CPUs submit IO. IO submitted on housekeeping CPUs has no influence on those queues. If the affinity mask contains both housekeeping and isolated CPUs, but none of the contained housekeeping CPUs is online, then the interrupt is also routed to an isolated CPU. Interrupts are only delivered when one of the isolated CPUs in the affinity mask submits IO. If one of the contained housekeeping CPUs comes online, the CPU hotplug logic migrates the interrupt automatically back to the upcoming housekeeping CPU. Depending on the type of interrupt controller, this can require that at least one interrupt is delivered to the isolated CPU in order to complete the migration. [ tglx: Removed unused parameter, added and edited comments/documentation and rephrased the changelog so it contains more details. ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200120091625.17912-1-ming.lei@redhat.com --- kernel/irq/cpuhotplug.c | 21 +++++++++++++++++++-- kernel/irq/manage.c | 41 ++++++++++++++++++++++++++++++++++++++++- kernel/sched/isolation.c | 6 ++++++ 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6c7ca2e983a5..02236b13b359 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internals.h" @@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void) } } +static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) +{ + const struct cpumask *hk_mask; + + if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + return false; + + hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) + return false; + + return cpumask_test_cpu(cpu, hk_mask); +} + static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) { struct irq_data *data = irq_desc_get_irq_data(desc); @@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) /* * If the interrupt can only be directed to a single target * CPU then it is already assigned to a CPU in the affinity - * mask. No point in trying to move it around. + * mask. No point in trying to move it around unless the + * isolation mechanism requests to move it to an upcoming + * housekeeping CPU. */ - if (!irqd_is_single_target(data)) + if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu)) irq_set_affinity_locked(data, affinity, false); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b6c53ab053d2..818b2802d3e7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - ret = chip->irq_set_affinity(data, mask, force); + /* + * If this is a managed interrupt and housekeeping is enabled on + * it check whether the requested affinity mask intersects with + * a housekeeping CPU. If so, then remove the isolated CPUs from + * the mask and just keep the housekeeping CPU(s). This prevents + * the affinity setter from routing the interrupt to an isolated + * CPU to avoid that I/O submitted from a housekeeping CPU causes + * interrupts on an isolated one. + * + * If the masks do not intersect or include online CPU(s) then + * keep the requested mask. The isolated target CPUs are only + * receiving interrupts when the I/O operation was submitted + * directly from them. + * + * If all housekeeping CPUs in the affinity mask are offline, the + * interrupt will be migrated by the CPU hotplug code once a + * housekeeping CPU which belongs to the affinity mask comes + * online. + */ + if (irqd_affinity_is_managed(data) && + housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { + const struct cpumask *hk_mask, *prog_mask; + + static DEFINE_RAW_SPINLOCK(tmp_mask_lock); + static struct cpumask tmp_mask; + + hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + + raw_spin_lock(&tmp_mask_lock); + cpumask_and(&tmp_mask, mask, hk_mask); + if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) + prog_mask = mask; + else + prog_mask = &tmp_mask; + ret = chip->irq_set_affinity(data, prog_mask, force); + raw_spin_unlock(&tmp_mask_lock); + } else { + ret = chip->irq_set_affinity(data, mask, force); + } switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 9fcb2a695a41..008d6ac2342b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -163,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } + if (!strncmp(str, "managed_irq,", 12)) { + str += 12; + flags |= HK_FLAG_MANAGED_IRQ; + continue; + } + pr_warn("isolcpus: Error, unknown flag\n"); return 0; } -- cgit v1.2.3 From be8704ff07d2374bcc5c675526f95e70c6459683 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jan 2020 16:53:46 -0800 Subject: bpf: Introduce dynamic program extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce dynamic program extensions. The users can load additional BPF functions and replace global functions in previously loaded BPF programs while these programs are executing. Global functions are verified individually by the verifier based on their types only. Hence the global function in the new program which types match older function can safely replace that corresponding function. This new function/program is called 'an extension' of old program. At load time the verifier uses (attach_prog_fd, attach_btf_id) pair to identify the function to be replaced. The BPF program type is derived from the target program into extension program. Technically bpf_verifier_ops is copied from target program. The BPF_PROG_TYPE_EXT program type is a placeholder. It has empty verifier_ops. The extension program can call the same bpf helper functions as target program. Single BPF_PROG_TYPE_EXT type is used to extend XDP, SKB and all other program types. The verifier allows only one level of replacement. Meaning that the extension program cannot recursively extend an extension. That also means that the maximum stack size is increasing from 512 to 1024 bytes and maximum function nesting level from 8 to 16. The programs don't always consume that much. The stack usage is determined by the number of on-stack variables used by the program. The verifier could have enforced 512 limit for combined original plus extension program, but it makes for difficult user experience. The main use case for extensions is to provide generic mechanism to plug external programs into policy program or function call chaining. BPF trampoline is used to track both fentry/fexit and program extensions because both are using the same nop slot at the beginning of every BPF function. Attaching fentry/fexit to a function that was replaced is not allowed. The opposite is true as well. Replacing a function that currently being analyzed with fentry/fexit is not allowed. The executable page allocated by BPF trampoline is not used by program extensions. This inefficiency will be optimized in future patches. Function by function verification of global function supports scalars and pointer to context only. Hence program extensions are supported for such class of global functions only. In the future the verifier will be extended with support to pointers to structures, arrays with sizes, etc. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200121005348.2769920-2-ast@kernel.org --- kernel/bpf/btf.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 15 ++++- kernel/bpf/trampoline.c | 41 ++++++++++++- kernel/bpf/verifier.c | 85 ++++++++++++++++++++------- 4 files changed, 266 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 832b5d7fd892..32963b6d5a9c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -276,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", }; +static const char *btf_type_str(const struct btf_type *t) +{ + return btf_kind_str[BTF_INFO_KIND(t->info)]; +} + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -4115,6 +4120,148 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } +/* Compare BTFs of two functions assuming only scalars and pointers to context. + * t1 points to BTF_KIND_FUNC in btf1 + * t2 points to BTF_KIND_FUNC in btf2 + * Returns: + * EINVAL - function prototype mismatch + * EFAULT - verifier bug + * 0 - 99% match. The last 1% is validated by the verifier. + */ +int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) +{ + const struct btf_param *args1, *args2; + const char *fn1, *fn2, *s1, *s2; + u32 nargs1, nargs2, i; + + fn1 = btf_name_by_offset(btf1, t1->name_off); + fn2 = btf_name_by_offset(btf2, t2->name_off); + + if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn1); + return -EINVAL; + } + if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn2); + return -EINVAL; + } + + t1 = btf_type_by_id(btf1, t1->type); + if (!t1 || !btf_type_is_func_proto(t1)) + return -EFAULT; + t2 = btf_type_by_id(btf2, t2->type); + if (!t2 || !btf_type_is_func_proto(t2)) + return -EFAULT; + + args1 = (const struct btf_param *)(t1 + 1); + nargs1 = btf_type_vlen(t1); + args2 = (const struct btf_param *)(t2 + 1); + nargs2 = btf_type_vlen(t2); + + if (nargs1 != nargs2) { + bpf_log(log, "%s() has %d args while %s() has %d args\n", + fn1, nargs1, fn2, nargs2); + return -EINVAL; + } + + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (t1->info != t2->info) { + bpf_log(log, + "Return type %s of %s() doesn't match type %s of %s()\n", + btf_type_str(t1), fn1, + btf_type_str(t2), fn2); + return -EINVAL; + } + + for (i = 0; i < nargs1; i++) { + t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); + t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); + + if (t1->info != t2->info) { + bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", + i, fn1, btf_type_str(t1), + fn2, btf_type_str(t2)); + return -EINVAL; + } + if (btf_type_has_size(t1) && t1->size != t2->size) { + bpf_log(log, + "arg%d in %s() has size %d while %s() has %d\n", + i, fn1, t1->size, + fn2, t2->size); + return -EINVAL; + } + + /* global functions are validated with scalars and pointers + * to context only. And only global functions can be replaced. + * Hence type check only those types. + */ + if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + continue; + if (!btf_type_is_ptr(t1)) { + bpf_log(log, + "arg%d in %s() has unrecognized type\n", + i, fn1); + return -EINVAL; + } + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (!btf_type_is_struct(t1)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn1); + return -EINVAL; + } + if (!btf_type_is_struct(t2)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn2); + return -EINVAL; + } + /* This is an optional check to make program writing easier. + * Compare names of structs and report an error to the user. + * btf_prepare_func_args() already checked that t2 struct + * is a context type. btf_prepare_func_args() will check + * later that t1 struct is a context type as well. + */ + s1 = btf_name_by_offset(btf1, t1->name_off); + s2 = btf_name_by_offset(btf2, t2->name_off); + if (strcmp(s1, s2)) { + bpf_log(log, + "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", + i, fn1, s1, fn2, s2); + return -EINVAL; + } + } + return 0; +} + +/* Compare BTFs of given program with BTF of target program */ +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf2, const struct btf_type *t2) +{ + struct btf *btf1 = prog->aux->btf; + const struct btf_type *t1; + u32 btf_id = 0; + + if (!prog->aux->func_info) { + bpf_log(&env->log, "Program extension requires BTF\n"); + return -EINVAL; + } + + btf_id = prog->aux->func_info[0].type_id; + if (!btf_id) + return -EFAULT; + + t1 = btf_type_by_id(btf1, btf_id); + if (!t1 || !btf_type_is_func(t1)) + return -EFAULT; + + return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); +} + /* Compare BTF of a function with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. @@ -4224,6 +4371,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t; @@ -4261,6 +4409,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Verifier bug in function %s()\n", tname); return -EFAULT; } + if (prog_type == BPF_PROG_TYPE_EXT) + prog_type = prog->aux->linked_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { @@ -4296,7 +4446,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { reg[i + 1].type = PTR_TO_CTX; continue; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a840c57f6df..a91ad518c050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1932,13 +1932,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { @@ -1981,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_EXT: + if (expected_attach_type) + return -EINVAL; + /* fallthrough */ default: return 0; } @@ -2183,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) int tr_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT) { + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } @@ -2250,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_EXT && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_TRACING) { + if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7657ede7aee2..eb64c245052b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,12 @@ #include #include +/* dummy _ops. The verifier will operate on target program's ops. */ +const struct bpf_verifier_ops bpf_extension_verifier_ops = { +}; +const struct bpf_prog_ops bpf_extension_prog_ops = { +}; + /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) @@ -194,8 +200,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; - default: + case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + default: + return BPF_TRAMP_REPLACE; } } @@ -204,12 +212,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) enum bpf_tramp_prog_type kind; struct bpf_trampoline *tr; int err = 0; + int cnt; tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); - if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] - >= BPF_MAX_TRAMP_PROGS) { + if (tr->extension_prog) { + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + err = -EBUSY; + goto out; + } + cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. */ + if (cnt) { + err = -EBUSY; + goto out; + } + tr->extension_prog = prog; + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + prog->bpf_func); + goto out; + } + if (cnt >= BPF_MAX_TRAMP_PROGS) { err = -E2BIG; goto out; } @@ -240,9 +267,17 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); + if (kind == BPF_TRAMP_REPLACE) { + WARN_ON_ONCE(!tr->extension_prog); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + tr->extension_prog->bpf_func, NULL); + tr->extension_prog = NULL; + goto out; + } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(prog->aux->trampoline); +out: mutex_unlock(&tr->mutex); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6defbec9eb62..9fe94f64f0ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9564,7 +9564,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) subprog); regs = state->frame[state->curframe]->regs; - if (subprog) { + if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { ret = btf_prepare_func_args(env, subprog, regs); if (ret) goto out; @@ -9742,6 +9742,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; @@ -9757,7 +9758,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING) + if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) return 0; if (!btf_id) { @@ -9793,8 +9794,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; + if (prog_extension) { + if (conservative) { + verbose(env, + "Cannot replace static functions\n"); + return -EINVAL; + } + if (!prog->jit_requested) { + verbose(env, + "Extension programs should be JITed\n"); + return -EINVAL; + } + env->ops = bpf_verifier_ops[tgt_prog->type]; + } + if (!tgt_prog->jited) { + verbose(env, "Can attach to only JITed progs\n"); + return -EINVAL; + } + if (tgt_prog->type == prog->type) { + /* Cannot fentry/fexit another fentry/fexit program. + * Cannot attach program extension to another extension. + * It's ok to attach fentry/fexit to extension program. + */ + verbose(env, "Cannot recursively attach\n"); + return -EINVAL; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING && + prog_extension && + (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + /* Program extensions can extend all program types + * except fentry/fexit. The reason is the following. + * The fentry/fexit programs are used for performance + * analysis, stats and can be attached to any program + * type except themselves. When extension program is + * replacing XDP function it is necessary to allow + * performance analysis of all functions. Both original + * XDP program and its program extension. Hence + * attaching fentry/fexit to BPF_PROG_TYPE_EXT is + * allowed. If extending of fentry/fexit was allowed it + * would be possible to create long call chain + * fentry->extension->fentry->extension beyond + * reasonable stack size. Hence extending fentry is not + * allowed. + */ + verbose(env, "Cannot extend fentry/fexit\n"); + return -EINVAL; + } key = ((u64)aux->id) << 32 | btf_id; } else { + if (prog_extension) { + verbose(env, "Cannot replace kernel functions\n"); + return -EINVAL; + } key = btf_id; } @@ -9832,6 +9884,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + default: + if (!prog_extension) + return -EINVAL; + /* fallthrough */ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -9839,6 +9895,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } + if (prog_extension && + btf_check_type_match(env, prog, btf, t)) + return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; @@ -9862,18 +9921,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (ret < 0) goto out; if (tgt_prog) { - if (!tgt_prog->jited) { - /* for now */ - verbose(env, "Can trace only JITed BPF progs\n"); - ret = -EINVAL; - goto out; - } - if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { - /* prevent cycles */ - verbose(env, "Cannot recursively attach\n"); - ret = -EINVAL; - goto out; - } if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -9895,8 +9942,6 @@ out: if (ret) bpf_trampoline_put(tr); return ret; - default: - return -EINVAL; } } @@ -9966,10 +10011,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; @@ -10006,6 +10047,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 5576b991e9c1a11d2cc21c4b94fc75ec27603896 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Jan 2020 15:36:46 -0800 Subject: bpf: Add BPF_FUNC_jiffies64 This patch adds a helper to read the 64bit jiffies. It will be used in a later patch to implement the bpf_cubic.c. The helper is inlined for jit_requested and 64 BITS_PER_LONG as the map_gen_lookup(). Other cases could be considered together with map_gen_lookup() if needed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200122233646.903260-1-kafai@fb.com --- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..973a20d49749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2137,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; +const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cada974c9f4e..d8b7b110a1c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, preempt_enable(); } +BPF_CALL_0(bpf_jiffies64) +{ + return get_jiffies_64(); +} + +const struct bpf_func_proto bpf_jiffies64_proto = { + .func = bpf_jiffies64, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe94f64f0ec..734abaa02123 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9445,6 +9445,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed -- cgit v1.2.3 From 485ec2ea9cf556e9c120e07961b7b459d776a115 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 23 Jan 2020 17:34:38 +0530 Subject: bpf, devmap: Pass lockdep expression to RCU lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit head is traversed using hlist_for_each_entry_rcu outside an RCU read-side critical section but under the protection of dtab->index_lock. Hence, add corresponding lockdep expression to silence false-positive lockdep warnings, and harden RCU lists. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Signed-off-by: Amol Grover Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200123120437.26506-1-frextrite@gmail.com --- kernel/bpf/devmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index de630f980282..f3a44f6ca86e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -263,7 +263,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; - hlist_for_each_entry_rcu(dev, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; -- cgit v1.2.3 From a35d16905efc6ad5523d864a5c6efcb1e657e386 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 5 Aug 2019 18:22:27 -0400 Subject: rcu: Add basic support for kfree_rcu() batching Recently a discussion about stability and performance of a system involving a high rate of kfree_rcu() calls surfaced on the list [1] which led to another discussion how to prepare for this situation. This patch adds basic batching support for kfree_rcu(). It is "basic" because we do none of the slab management, dynamic allocation, code moving or any of the other things, some of which previous attempts did [2]. These fancier improvements can be follow-up patches and there are different ideas being discussed in those regards. This is an effort to start simple, and build up from there. In the future, an extension to use kfree_bulk and possibly per-slab batching could be done to further improve performance due to cache-locality and slab-specific bulk free optimizations. By using an array of pointers, the worker thread processing the work would need to read lesser data since it does not need to deal with large rcu_head(s) any longer. Torture tests follow in the next patch and show improvements of around 5x reduction in number of grace periods on a 16 CPU system. More details and test data are in that patch. There is an implication with rcu_barrier() with this patch. Since the kfree_rcu() calls can be batched, and may not be handed yet to the RCU machinery in fact, the monitor may not have even run yet to do the queue_rcu_work(), there seems no easy way of implementing rcu_barrier() to wait for those kfree_rcu()s that are already made. So this means a kfree_rcu() followed by an rcu_barrier() does not imply that memory will be freed once rcu_barrier() returns. Another implication is higher active memory usage (although not run-away..) until the kfree_rcu() flooding ends, in comparison to without batching. More details about this are in the second patch which adds an rcuperf test. Finally, in the near future we will get rid of kfree_rcu() special casing within RCU such as in rcu_do_batch and switch everything to just batching. Currently we don't do that since timer subsystem is not yet up and we cannot schedule the kfree_rcu() monitor as the timer subsystem's lock are not initialized. That would also mean getting rid of kfree_call_rcu_nobatch() entirely. [1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-mst@kernel.org [2] https://lkml.org/lkml/2017/12/19/824 Cc: kernel-team@android.com Cc: kernel-team@lge.com Co-developed-by: Byungchul Park Signed-off-by: Byungchul Park Signed-off-by: Joel Fernandes (Google) [ paulmck: Applied 0day and Paul Walmsley feedback on ->monitor_todo. ] [ paulmck: Make it work during early boot. ] [ paulmck: Add a crude early boot self-test. ] [ paulmck: Style adjustments and experimental docbook structure header. ] Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.9999.1908161931110.32497@viisi.sifive.com/T/#me9956f66cb611b95d26ae92700e1d901f46e8c59 Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/update.c | 10 +++ 2 files changed, 198 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..0af016fdbf19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2683,19 +2683,187 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_work rcu_work; + struct rcu_head *head; + struct rcu_head *head_free; + spinlock_t lock; + struct delayed_work monitor_work; + int monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + + krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + spin_lock_irqsave(&krcp->lock, flags); + head = krcp->head_free; + krcp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + next = head->next; + // Potentially optimize with kfree_bulk in future. + __rcu_reclaim(rcu_state.name, head); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + lockdep_assert_held(&krcp->lock); + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (krcp->head_free) + return false; + + krcp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krcp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + if (!xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (xchg(&krcp->monitor_todo, false)) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. + * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). + */ +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, 1); } +EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); + +/* + * Queue a request for lazy invocation of kfree() after a grace period. + * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). + */ +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + head->func = func; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); +} EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPT. @@ -3557,12 +3725,26 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..196487762b96 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -218,6 +219,7 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } @@ -853,14 +855,22 @@ static void test_callback(struct rcu_head *r) DEFINE_STATIC_SRCU(early_srcu); +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; + static void early_boot_test_call_rcu(void) { static struct rcu_head head; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); if (IS_ENABLED(CONFIG_SRCU)) call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) -- cgit v1.2.3 From e6e78b004fa7e0ab455d46d27f218bf6ce178a18 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:29 -0400 Subject: rcuperf: Add kfree_rcu() performance Tests This test runs kfree_rcu() in a loop to measure performance of the new kfree_rcu() batching functionality. The following table shows results when booting with arguments: rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000 rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X rcuperf.kfree_no_batch=X # Grace Periods Test Duration (s) X=1 (old behavior) 9133 11.5 X=0 (new behavior) 1732 12.5 On a 16 CPU system with the above boot parameters, we see that the total number of grace periods that elapse during the test drops from 9133 when not batching to 1732 when batching (a 5X improvement). The kfree_rcu() flood itself slows down a bit when batching, though, as shown. Note that the active memory consumption during the kfree_rcu() flood does increase to around 200-250MB due to the batching (from around 50MB without batching). However, this memory consumption is relatively constant. In other words, the system is able to keep up with the kfree_rcu() load. The memory consumption comes down considerably if KFREE_DRAIN_JIFFIES is increased from HZ/50 to HZ/80. A later patch will reduce memory consumption further by using multiple lists. Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and CONFIG_PROVE_RCU for realistic comparisons with/without batching. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5f884d560384..c1e25fd10f2a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished; static wait_queue_head_t shutdown_wq; static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_perf_writer_started; -static unsigned long b_rcu_perf_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 @@ -378,10 +379,10 @@ rcu_perf_writer(void *arg) if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; if (gp_exp) { - b_rcu_perf_writer_started = + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = cur_ops->get_gp_seq(); + b_rcu_gp_test_started = cur_ops->get_gp_seq(); } } @@ -429,10 +430,10 @@ retry: PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); } if (shutdown) { @@ -515,8 +516,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - rcuperf_seq_diff(b_rcu_perf_writer_finished, - b_rcu_perf_writer_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -584,6 +585,167 @@ rcu_perf_shutdown(void *arg) return -EINVAL; } +/* + * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_perf_thread_started; +static atomic_t n_kfree_perf_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_perf_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + if (!kfree_no_batch) { + kfree_rcu(alloc_ptr, rh); + } else { + rcu_callback_t cb; + + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); + kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); + } + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_perf_thread"); + return 0; +} + +static void +kfree_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_perf_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= + kfree_nrealthreads); + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_perf_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. */ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_perf_cleanup(); + return firsterr; +} + static int __init rcu_perf_init(void) { @@ -616,6 +778,9 @@ rcu_perf_init(void) if (cur_ops->init) cur_ops->init(); + if (kfree_rcu_test) + return kfree_perf_init(); + nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); atomic_set(&n_rcu_perf_reader_started, 0); -- cgit v1.2.3 From 569d767087ef56a59bc2d1d5f25ee447eeae442a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 22 Sep 2019 10:49:57 -0700 Subject: rcu: Make kfree_rcu() use a non-atomic ->monitor_todo Because the ->monitor_todo field is always protected by krcp->lock, this commit downgrades from xchg() to non-atomic unmarked assignment statements. Signed-off-by: Joel Fernandes [ paulmck: Update to include early-boot kick code. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0af016fdbf19..6106b9e0b5fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2708,7 +2708,7 @@ struct kfree_rcu_cpu { struct rcu_head *head_free; spinlock_t lock; struct delayed_work monitor_work; - int monitor_todo; + bool monitor_todo; bool initialized; }; @@ -2765,6 +2765,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { // Attempt to start a new batch. + krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. spin_unlock_irqrestore(&krcp->lock, flags); @@ -2772,8 +2773,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, } // Previous RCU batch still in progress, try again later. - if (!xchg(&krcp->monitor_todo, true)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } @@ -2788,7 +2789,7 @@ static void kfree_rcu_monitor(struct work_struct *work) monitor_work.work); spin_lock_irqsave(&krcp->lock, flags); - if (xchg(&krcp->monitor_todo, false)) + if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else spin_unlock_irqrestore(&krcp->lock, flags); @@ -2837,8 +2838,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !xchg(&krcp->monitor_todo, true)) + !krcp->monitor_todo) { + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } if (krcp->initialized) spin_unlock(&krcp->lock); @@ -2855,10 +2858,11 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + if (!krcp->head || krcp->monitor_todo) { spin_unlock_irqrestore(&krcp->lock, flags); continue; } + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } -- cgit v1.2.3 From 0392bebebf26f09434e6c7ca4c09c014efeef76a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 19 Sep 2019 14:58:26 -0700 Subject: rcu: Add multiple in-flight batches of kfree_rcu() work During testing, it was observed that amount of memory consumed due kfree_rcu() batching is 300-400MB. Previously we had only a single head_free pointer pointing to the list of rcu_head(s) that are to be freed after a grace period. Until this list is drained, we cannot queue any more objects on it since such objects may not be ready to be reclaimed when the worker thread eventually gets to drainin g the head_free list. We can do better by maintaining multiple lists as done by this patch. Testing shows that memory consumption came down by around 100-150MB with just adding another list. Adding more than 1 additional list did not show any improvement. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Code style and initialization handling. ] [ paulmck: Fix field name, reported by kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6106b9e0b5fb..a40fd58bd4b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2686,12 +2686,25 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 /** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending @@ -2703,9 +2716,8 @@ EXPORT_SYMBOL_GPL(call_rcu); * the interactions with the slab allocators. */ struct kfree_rcu_cpu { - struct rcu_work rcu_work; struct rcu_head *head; - struct rcu_head *head_free; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; @@ -2723,11 +2735,14 @@ static void kfree_rcu_work(struct work_struct *work) unsigned long flags; struct rcu_head *head, *next; struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; - krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; spin_lock_irqsave(&krcp->lock, flags); - head = krcp->head_free; - krcp->head_free = NULL; + head = krwp->head_free; + krwp->head_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); // List "head" is now private, so traverse locklessly. @@ -2747,17 +2762,25 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } // If a previous RCU batch is in progress, we cannot immediately // queue another one, so return false to tell caller to retry. - if (krcp->head_free) + if (!krwp) return false; - krcp->head_free = krcp->head; + krwp->head_free = krcp->head; krcp->head = NULL; - INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krcp->rcu_work); + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); return true; } @@ -2863,7 +2886,8 @@ void __init kfree_rcu_scheduler_running(void) continue; } krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3732,11 +3756,14 @@ struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; + int i; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From e99637becb2e684bee2b9117f817f4d1346b8353 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Sep 2019 13:03:17 -0700 Subject: rcu: Add support for debug_objects debugging for kfree_rcu() This commit applies RCU's debug_objects debugging to the new batched kfree_rcu() implementations. The object is queued at the kfree_rcu() call and dequeued during reclaim. Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects double kfree_rcu() calls. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix IRQ per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a40fd58bd4b6..0512221cd84b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2749,6 +2749,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; head; head = next) { next = head->next; // Potentially optimize with kfree_bulk in future. + debug_rcu_head_unqueue(head); __rcu_reclaim(rcu_state.name, head); cond_resched_tasks_rcu_qs(); } @@ -2855,6 +2856,12 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } head->func = func; head->next = krcp->head; krcp->head = head; @@ -2866,6 +2873,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); } +unlock_return: if (krcp->initialized) spin_unlock(&krcp->lock); local_irq_restore(flags); -- cgit v1.2.3 From 77a40f97030b27b3fc1640a3ed203870f0817f57 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:32 -0400 Subject: rcu: Remove kfree_rcu() special casing and lazy-callback handling This commit removes kfree_rcu() special-casing and the lazy-callback handling from Tree RCU. It moves some of this special casing to Tiny RCU, the removal of which will be the subject of later commits. This results in a nice negative delta. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Add slab.h #include, thanks to kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 27 -------------------------- kernel/rcu/rcu_segcblist.c | 25 +++--------------------- kernel/rcu/rcu_segcblist.h | 25 ++---------------------- kernel/rcu/srcutree.c | 4 ++-- kernel/rcu/tiny.c | 28 ++++++++++++++++++++++++++- kernel/rcu/tree.c | 40 ++++++++++++++++++++++++++------------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 48 +++++++++++----------------------------------- kernel/rcu/tree_stall.h | 6 ++---- 9 files changed, 75 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..c30a1f7dbd15 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); - -/* - * Reclaim the specified callback, either by invoking it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback(rn, head, offset); - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - trace_rcu_invoke_callback(rn, head); - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } -} - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index cbc87b804db9..5f4fd3b8777c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; } /* * Enqueue an rcu_head structure onto the specified callback list. - * This function assumes that the callback is non-lazy because it - * is intended for use by no-CBs CPUs, which do not distinguish - * between lazy and non-lazy RCU callbacks. */ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) { @@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, else drclp->tail = &drclp->head; drclp->len = srclp->len; - drclp->len_lazy = srclp->len_lazy; if (!rhp) { rcu_cblist_init(srclp); } else { @@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, srclp->head = rhp; srclp->tail = &rhp->next; WRITE_ONCE(srclp->len, 1); - srclp->len_lazy = 0; } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; rcu_segcblist_set_len(rsclp, 0); - rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->enabled = 0; } @@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); @@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) @@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rclp->len_lazy += rsclp->len_lazy; - rsclp->len_lazy = 0; rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } @@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 815c2fdd3fcc..5c293afc07b8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; -} - /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? @@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..d0a9d5b69087 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -853,7 +853,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, local_irq_save(flags); sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -1052,7 +1052,7 @@ void srcu_barrier(struct srcu_struct *ssp) sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { + &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_barrier_cpu_cnt); } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 477b4eb44af5..dd572ce7c747 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "rcu.h" @@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user) } } +/* + * Reclaim the specified callback, either by invoking it for non-kfree cases or + * freeing it directly (for kfree). Return true if kfreeing, false otherwise. + */ +static inline bool rcu_reclaim_tiny(struct rcu_head *head) +{ + rcu_callback_t f; + unsigned long offset = (unsigned long)head->func; + + rcu_lock_acquire(&rcu_callback_map); + if (__is_kfree_rcu_offset(offset)) { + trace_rcu_invoke_kfree_callback("", head, offset); + kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); + return true; + } + + trace_rcu_invoke_callback("", head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); + rcu_lock_release(&rcu_callback_map); + return false; +} + /* Invoke the RCU callbacks whose grace period has elapsed. */ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { @@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim("", list); + rcu_reclaim_tiny(list); local_bh_enable(); list = next; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0512221cd84b..a8dd612098bf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include "../time/tick-internal.h" @@ -2146,7 +2147,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2168,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2178,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2583,7 +2592,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2627,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,7 +2687,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2747,10 +2755,18 @@ static void kfree_rcu_work(struct work_struct *work) // List "head" is now private, so traverse locklessly. for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + next = head->next; // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); - __rcu_reclaim(rcu_state.name, head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + /* Could be possible to optimize with kfree_bulk in future */ + kfree((void *)head - offset); + + rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); } } @@ -2825,7 +2841,7 @@ static void kfree_rcu_monitor(struct work_struct *work) */ void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); @@ -3100,7 +3116,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..15405420b40c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -183,7 +183,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..d5334e49ccca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1262,10 +1262,9 @@ static void rcu_prepare_for_idle(void) /* * This code is invoked when a CPU goes idle, at which point we want * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. + * the energy-efficient dyntick-idle mode. * - * The following three proprocessor symbols control this state machine: + * The following preprocessor symbol controls this: * * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This @@ -1274,21 +1273,15 @@ static void rcu_prepare_for_idle(void) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. * - * The values below work well in practice. If future workloads require + * The value below works well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); /* * Try to advance callbacks on the current CPU, but only if it has been @@ -1327,8 +1320,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. + * caller about what to set the timeout. * * The caller must have disabled interrupts. */ @@ -1354,25 +1346,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) } rdp->last_accelerate = jiffies; - /* Request timer delay depending on laziness, and round. */ - rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); - if (rdp->all_lazy) { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } else { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } + /* Request timer and round. */ + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; + *nextevt = basemono + dj * TICK_NSEC; return 0; } /* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. + * Prepare a CPU for idle from an RCU perspective. The first major task is to + * sense whether nohz mode has been enabled or disabled via sysfs. The second + * major task is to accelerate (that is, assign grace-period numbers to) any + * recently arrived callbacks. * * The caller must have disabled interrupts. */ @@ -1398,17 +1383,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { - rdp->all_lazy = false; - invoke_rcu_core(); - return; - } - /* * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..806f2ddc8f74 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!!rdp->tick_nohz_enabled_snap]); + !!rdp->tick_nohz_enabled_snap); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From 189a6883dcf7fa70e17403ae4225c60ffc9e404b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:33 -0400 Subject: rcu: Remove kfree_call_rcu_nobatch() Now that the kfree_rcu() special-casing has been removed from tree RCU, this commit removes kfree_call_rcu_nobatch() since it is no longer needed. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 10 +--------- kernel/rcu/tree.c | 18 ++++-------------- 2 files changed, 5 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c1e25fd10f2a..da94b89cd531 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); -torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -632,14 +631,7 @@ kfree_perf_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - if (!kfree_no_batch) { - kfree_rcu(alloc_ptr, rh); - } else { - rcu_callback_t cb; - - cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); - kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); - } + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a8dd612098bf..31d2d9255d95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2763,8 +2763,10 @@ static void kfree_rcu_work(struct work_struct *work) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - /* Could be possible to optimize with kfree_bulk in future */ - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. */ + kfree((void *)head - offset); + } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2835,16 +2837,6 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } -/* - * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. - * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). - */ -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); - /* * Queue a request for lazy invocation of kfree() after a grace period. * @@ -2864,8 +2856,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unsigned long flags; struct kfree_rcu_cpu *krcp; - head->func = func; - local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) -- cgit v1.2.3 From c130d2dc93cd03323494d82dbe7b5fb0d101ab62 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:48 +0000 Subject: rcu: Rename some instance of CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU CONFIG_PREEMPTION and CONFIG_PREEMPT_RCU are always identical, but some code depends on CONFIG_PREEMPTION to access to rcu_preempt functionality. This patch changes CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU in these cases. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_stall.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c9dbb05e4c13..5445da2326a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1934,7 +1934,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2294,7 +2294,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..a6652efedd48 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RCU /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPTION */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPTION */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* * Dump stacks of all tasks running on stalled CPUs. First try using -- cgit v1.2.3 From 2eeba5838fd8c5e19bb91e25624116936348e7af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 04:06:22 -0700 Subject: rcu: Clear .exp_hint only when deferred quiescent state has been reported Currently, the .exp_hint flag is cleared in rcu_read_unlock_special(), which works, but which can also prevent subsequent rcu_read_unlock() calls from helping expedite the quiescent state needed by an ongoing expedited RCU grace period. This commit therefore defers clearing of .exp_hint from rcu_read_unlock_special() to rcu_preempt_deferred_qs_irqrestore(), thus ensuring that intervening calls to rcu_read_unlock() have a chance to help end the expedited grace period. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8cdce111ea73..7487c7930a47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,6 +444,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.exp_hint = false; t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); @@ -610,7 +611,6 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - t->rcu_read_unlock_special.b.exp_hint = false; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); @@ -640,7 +640,6 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); return; } - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } -- cgit v1.2.3 From 3717e1e9f25ec7059e421ab6fc602cab7063c11c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 05:06:21 -0700 Subject: rcu: Clear ->rcu_read_unlock_special only once In rcu_preempt_deferred_qs_irqrestore(), ->rcu_read_unlock_special is cleared one piece at a time. Given that the "if" statements in this function use the copy in "special", this commit removes the clearing of the individual pieces in favor of clearing ->rcu_read_unlock_special in one go just after it has been determined to be non-zero. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7487c7930a47..c3a32717c42d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,16 +444,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } - t->rcu_read_unlock_special.b.exp_hint = false; - t->rcu_read_unlock_special.b.deferred_qs = false; - if (special.b.need_qs) { + t->rcu_read_unlock_special.s = 0; + if (special.b.need_qs) rcu_qs(); - t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { - local_irq_restore(flags); - return; - } - } /* * Respond to a request by an expedited grace period for a @@ -461,17 +454,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) { + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!t->rcu_read_unlock_special.s) { - local_irq_restore(flags); - return; - } - } /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { - t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The task -- cgit v1.2.3 From c51f83c315c392d9776c33eb16a2fe1349d65c7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:22:45 -0800 Subject: rcu: Use READ_ONCE() for ->expmask in rcu_read_unlock_special() The rcu_node structure's ->expmask field is updated only when holding the ->lock, but is also accessed locklessly. This means that all ->expmask updates must use WRITE_ONCE() and all reads carried out without holding ->lock must use READ_ONCE(). This commit therefore changes the lockless ->expmask read in rcu_read_unlock_special() to use READ_ONCE(). Reported-by: syzbot+99f4ddade3c22ab0cf23@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c3a32717c42d..698a7f195f88 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -599,7 +599,7 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_node *rnp = rdp->mynode; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & rnp->expmask) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. if (irqs_were_disabled && use_softirq && -- cgit v1.2.3 From 77339e61aa309310a535bd01eb3388f7a27b36f9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 15 Nov 2019 14:08:53 -0800 Subject: rcu: Provide wrappers for uses of ->rcu_read_lock_nesting This commit provides wrapper functions for uses of ->rcu_read_lock_nesting to improve readability and to ease future changes to support inlining of __rcu_read_lock() and __rcu_read_unlock(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_plugin.h | 53 +++++++++++++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 98d078cafa5a..d8da6b1a3209 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -610,7 +610,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!t->rcu_read_lock_nesting) { + if (!rcu_preempt_depth()) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -634,7 +634,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (t->rcu_read_lock_nesting > 0) { + if (rcu_preempt_depth() > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 698a7f195f88..ebdbdec5911f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); - if (t->rcu_read_lock_nesting > 0 && + WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ @@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #define RCU_NEST_NMAX (-INT_MAX / 2) #define RCU_NEST_PMAX (INT_MAX / 2) +static void rcu_preempt_read_enter(void) +{ + current->rcu_read_lock_nesting++; +} + +static void rcu_preempt_read_exit(void) +{ + current->rcu_read_lock_nesting--; +} + +static void rcu_preempt_depth_set(int val) +{ + current->rcu_read_lock_nesting = val; +} + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) */ void __rcu_read_lock(void) { - current->rcu_read_lock_nesting++; + rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); + WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -373,19 +388,19 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; + if (rcu_preempt_depth() != 1) { + rcu_preempt_read_exit(); } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = -RCU_NEST_BIAS; + rcu_preempt_depth_set(-RCU_NEST_BIAS); barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; + rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - int rrln = t->rcu_read_lock_nesting; + int rrln = rcu_preempt_depth(); WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } @@ -539,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - t->rcu_read_lock_nesting <= 0; + rcu_preempt_depth() <= 0; } /* @@ -552,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = t->rcu_read_lock_nesting >= 0; + bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -672,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user) if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } - if (t->rcu_read_lock_nesting > 0 || + if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ if (rcu_preempt_need_deferred_qs(t)) { @@ -682,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!t->rcu_read_lock_nesting) { + } else if (!rcu_preempt_depth()) { rcu_qs(); /* Report immediate QS. */ return; } /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ - if (t->rcu_read_lock_nesting > 0 && + if (rcu_preempt_depth() > 0 && __this_cpu_read(rcu_data.core_needs_qs) && __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && @@ -709,11 +724,11 @@ void exit_rcu(void) struct task_struct *t = current; if (unlikely(!list_empty(¤t->rcu_node_entry))) { - t->rcu_read_lock_nesting = 1; + rcu_preempt_depth_set(1); barrier(); WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); - } else if (unlikely(t->rcu_read_lock_nesting)) { - t->rcu_read_lock_nesting = 1; + } else if (unlikely(rcu_preempt_depth())) { + rcu_preempt_depth_set(1); } else { return; } -- cgit v1.2.3 From 5b14557b073c96a7cf79adc4d7b6c4a8c26b2a43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Nov 2019 18:05:45 -0800 Subject: rcu: Avoid tick_dep_set_cpu() misordering In the current code, rcu_nmi_enter_common() might decide to turn on the tick using tick_dep_set_cpu(), but be delayed just before doing so. Then the grace-period kthread might notice that the CPU in question had in fact gone through a quiescent state, thus turning off the tick using tick_dep_clear_cpu(). The later invocation of tick_dep_set_cpu() would then incorrectly leave the tick on. This commit therefore enlists the aid of the leaf rcu_node structure's ->lock to ensure that decisions to enable or disable the tick are carried out before they can be reversed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5445da2326a0..b0e0612392a9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -800,8 +800,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,8 +828,13 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, @@ -898,6 +903,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { -- cgit v1.2.3 From 822175e72995a9ff7eeef4f5cd3f945f2697b67d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:56 +0000 Subject: rcu: Fix harmless omission of "CONFIG_" from #if condition The C preprocessor macros SRCU and TINY_RCU should instead be CONFIG_SRCU and CONFIG_TINY_RCU, respectively in the #f in kernel/rcu/rcu.h. But there is no harm when "TINY_RCU" is wrongly used, which are always non-defined, which makes "!defined(TINY_RCU)" always true, which means the code block is always included, and the included code block doesn't cause any compilation error so far in CONFIG_TINY_RCU builds. It is also the reason this change should not be taken in -stable. This commit adds the needed "CONFIG_" prefix to both macros. Not for -stable. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..473259422547 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -281,7 +281,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(SRCU) || !defined(TINY_RCU) +#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) #include @@ -418,7 +418,7 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_SRCU void srcu_init(void); -- cgit v1.2.3 From 2488a5e695564897a0f8ea42cd3af71647cd26d2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:57 +0000 Subject: rcu: Fix tracepoint tracking RCU CPU kthread utilization In the call to trace_rcu_utilization() at the start of the loop in rcu_cpu_kthread(), "rcu_wait" is incorrect, plus this trace event needs to be hoisted above the loop to balance with either the "rcu_wait" or "rcu_yield", depending on how the loop exits. This commit therefore makes these changes. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd8cfc34f4da..ba154a3080fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2474,8 +2474,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); -- cgit v1.2.3 From 4778339df0ee361dbf2cbdcb87abaec9dfbc841d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:46 +0000 Subject: rcu: Remove the declaration of call_rcu() in tree.h The call_rcu() function is an external RCU API that is declared in include/linux/rcupdate.h. There is thus no point in redeclaring it in kernel/rcu/tree.h, so this commit removes that redundant declaration. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4dc5debfc84..54ff9896ae31 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -413,7 +413,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); -void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -- cgit v1.2.3 From e2167b38c87a0c9e85c342a823dae1e6f67b11d9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:47 +0000 Subject: rcu: Move gp_state_names[] and gp_state_getname() to tree_stall.h Only tree_stall.h needs to get name from GP state, so this commit moves the gp_state_names[] array and the gp_state_getname() from kernel/rcu/tree.h and kernel/rcu/tree.c, respectively, to kernel/rcu/tree_stall.h. While moving gp_state_names[], this commit uses the GCC syntax to ensure that the right string is associated with the right CPP macro. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ---------- kernel/rcu/tree.h | 12 ------------ kernel/rcu/tree_stall.h | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba154a3080fb..bbb60ed310b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void) return &rcu_state.node[0]; } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Send along grace-period-related data for rcutorture diagnostics. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 54ff9896ae31..9d5986abfc67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -368,18 +368,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..f18adaf3bf39 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -279,6 +279,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Print out diagnostic information for the specified stalled CPU. * -- cgit v1.2.3 From e1350e8e0ea5d959c23c5e593ff3026a67dbb049 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 14:48:22 +0100 Subject: rcu: Move rcu_{expedited,normal} definitions into rcupdate.h This commit moves the rcu_{expedited,normal} definitions from kernel/rcu/update.c to include/linux/rcupdate.h to make sure they are in sync, and also to avoid the following warning from sparse: kernel/ksysfs.c:150:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? kernel/ksysfs.c:167:5: warning: symbol 'rcu_normal' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..294d357abd0c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,9 +51,7 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); -extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); -- cgit v1.2.3 From 7441e7661d6586ae36329b7956e4d713d81e9903 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 09:37:11 -0700 Subject: rcu: Switch force_qs_rnp() to for_each_leaf_node_cpu_mask() Currently, force_qs_rnp() uses a for_each_leaf_node_possible_cpu() loop containing a check of the current CPU's bit in ->qsmask. This works, but this commit saves three lines by instead using for_each_leaf_node_cpu_mask(), which combines the functionality of for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit(). This commit also replaces the use of the local variable "bit" with rdp->grpmask. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbb60ed310b1..d95076498488 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2298,14 +2298,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { -- cgit v1.2.3 From 844a378de3372c923909681706d62336d702531e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:08:30 -0800 Subject: srcu: Apply *_ONCE() to ->srcu_last_gp_end The ->srcu_last_gp_end field is accessed from any CPU at any time by synchronize_srcu(), so non-initialization references need to use READ_ONCE() and WRITE_ONCE(). This commit therefore makes that change. Reported-by: syzbot+08f3e9d26e5541e1ecf2@syzkaller.appspotmail.com Acked-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..21acdff3bd27 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(ssp); - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) @@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long flags; struct srcu_data *sdp; unsigned long t; + unsigned long tlast; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); + tlast = READ_ONCE(ssp->srcu_last_gp_end); if (exp_holdoff == 0 || - time_in_range_open(t, ssp->srcu_last_gp_end, - ssp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ -- cgit v1.2.3 From f6105fc2a9c0ea5be6b9e5c19b2551af0b8b4eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 11:36:07 -0800 Subject: rcu: Remove unused stop-machine #include Long ago, RCU used the stop-machine mechanism to implement expedited grace periods, but no longer does so. This commit therefore removes the no-longer-needed #includes of linux/stop_machine.h. Link: https://lwn.net/Articles/805317/ Reported-by: Viresh Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree.h | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d95076498488..878f62f218e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9d5986abfc67..ce90c68c184b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "rcu_segcblist.h" -- cgit v1.2.3 From 5671d814dbd204b4ecc705045b5f1a647bff6f3b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Jan 2020 10:01:35 +0100 Subject: smp: Use smp_cond_func_t as type for the conditional function Use a typdef for the conditional function instead defining it each time in the function prototype. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200117090137.1205765-2-bigeasy@linutronix.de --- kernel/smp.c | 11 +++++------ kernel/up.c | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7dbcb402c2fc..c64044d68bc6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -680,9 +680,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ -void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags, const struct cpumask *mask) +void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, gfp_t gfp_flags, + const struct cpumask *mask) { cpumask_var_t cpus; int cpu, ret; @@ -714,9 +714,8 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, gfp_t gfp_flags) { on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, cpu_online_mask); diff --git a/kernel/up.c b/kernel/up.c index 862b460ab97a..5c0d4f2bece2 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. */ -void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags, const struct cpumask *mask) +void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, gfp_t gfp_flags, + const struct cpumask *mask) { unsigned long flags; @@ -84,9 +84,8 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, gfp_t gfp_flags) { on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); } -- cgit v1.2.3 From 67719ef25eeb2048b11befa6a757aeb3848b5df1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Jan 2020 10:01:36 +0100 Subject: smp: Add a smp_cond_func_t argument to smp_call_function_many() on_each_cpu_cond_mask() allocates a new CPU mask. The newly allocated mask is a subset of the provided mask based on the conditional function. This memory allocation can be avoided by extending smp_call_function_many() with the conditional function and performing the remote function call based on the mask and the conditional function. Rename smp_call_function_many() to smp_call_function_many_cond() and add the smp_cond_func_t argument. If smp_cond_func_t is provided then it is used before invoking the function. Provide smp_call_function_many() with cond_func set to NULL. Let on_each_cpu_cond_mask() use smp_call_function_many_cond(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200117090137.1205765-3-bigeasy@linutronix.de --- kernel/smp.c | 81 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index c64044d68bc6..e17e6344ab54 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -395,22 +395,9 @@ call: } EXPORT_SYMBOL_GPL(smp_call_function_any); -/** - * smp_call_function_many(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on (only runs on online subset). - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. Preemption - * must be disabled when calling this function. - */ -void smp_call_function_many(const struct cpumask *mask, - smp_call_func_t func, void *info, bool wait) +static void smp_call_function_many_cond(const struct cpumask *mask, + smp_call_func_t func, void *info, + bool wait, smp_cond_func_t cond_func) { struct call_function_data *cfd; int cpu, next_cpu, this_cpu = smp_processor_id(); @@ -448,7 +435,8 @@ void smp_call_function_many(const struct cpumask *mask, /* Fastpath: do that cpu by itself. */ if (next_cpu >= nr_cpu_ids) { - smp_call_function_single(cpu, func, info, wait); + if (!cond_func || (cond_func && cond_func(cpu, info))) + smp_call_function_single(cpu, func, info, wait); return; } @@ -465,6 +453,9 @@ void smp_call_function_many(const struct cpumask *mask, for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); + if (cond_func && !cond_func(cpu, info)) + continue; + csd_lock(csd); if (wait) csd->flags |= CSD_FLAG_SYNCHRONOUS; @@ -486,6 +477,26 @@ void smp_call_function_many(const struct cpumask *mask, } } } + +/** + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +void smp_call_function_many(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + smp_call_function_many_cond(mask, func, info, wait, NULL); +} EXPORT_SYMBOL(smp_call_function_many); /** @@ -684,33 +695,17 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags, const struct cpumask *mask) { - cpumask_var_t cpus; - int cpu, ret; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { - preempt_disable(); - for_each_cpu(cpu, mask) - if (cond_func(cpu, info)) - __cpumask_set_cpu(cpu, cpus); - on_each_cpu_mask(cpus, func, info, wait); - preempt_enable(); - free_cpumask_var(cpus); - } else { - /* - * No free cpumask, bother. No matter, we'll - * just have to IPI them one by one. - */ - preempt_disable(); - for_each_cpu(cpu, mask) - if (cond_func(cpu, info)) { - ret = smp_call_function_single(cpu, func, - info, wait); - WARN_ON_ONCE(ret); - } - preempt_enable(); + int cpu = get_cpu(); + + smp_call_function_many_cond(mask, func, info, wait, cond_func); + if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) { + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); } + put_cpu(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); -- cgit v1.2.3 From cb923159bbb8cc8fe09c19a3435ee11fd546f3d3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Jan 2020 10:01:37 +0100 Subject: smp: Remove allocation mask from on_each_cpu_cond.*() The allocation mask is no longer used by on_each_cpu_cond() and on_each_cpu_cond_mask() and can be removed. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200117090137.1205765-4-bigeasy@linutronix.de --- kernel/smp.c | 13 +++---------- kernel/up.c | 7 +++---- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index e17e6344ab54..3b7bedc97af3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -679,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask); * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. - * @gfp_flags: GFP flags to use when allocating the cpumask - * used internally by the function. - * - * The function might sleep if the GFP flags indicates a non - * atomic allocation is allowed. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. @@ -692,8 +687,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait, gfp_t gfp_flags, - const struct cpumask *mask) + void *info, bool wait, const struct cpumask *mask) { int cpu = get_cpu(); @@ -710,10 +704,9 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, EXPORT_SYMBOL(on_each_cpu_cond_mask); void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait, gfp_t gfp_flags) + void *info, bool wait) { - on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, - cpu_online_mask); + on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/up.c b/kernel/up.c index 5c0d4f2bece2..53144d056252 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -69,8 +69,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * same condtions in UP and SMP. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait, gfp_t gfp_flags, - const struct cpumask *mask) + void *info, bool wait, const struct cpumask *mask) { unsigned long flags; @@ -85,9 +84,9 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, EXPORT_SYMBOL(on_each_cpu_cond_mask); void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait, gfp_t gfp_flags) + void *info, bool wait) { - on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); + on_each_cpu_cond_mask(cond_func, func, info, wait, NULL); } EXPORT_SYMBOL(on_each_cpu_cond); -- cgit v1.2.3 From 6b088cefbeaa87ba48bf838edfc1e19c9ff3976b Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:49 -0800 Subject: alarmtimer: Update alarmtimer_get_rtcdev() docs to reflect reality This function doesn't do anything like this comment says when an RTC device hasn't been chosen. It looks like we used to do something like that before commit 8bc0dafb5cf3 ("alarmtimers: Rework RTC device selection using class interface") but that's long gone now. Remove this sentence to avoid confusing the reader. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200124055849.154411-5-swboyd@chromium.org --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9dc7a0913190..564ff5df2b0b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -69,8 +69,6 @@ static DEFINE_SPINLOCK(rtcdev_lock); * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. - * If one has not already been chosen, it checks to see if a - * functional rtc device is available. */ struct rtc_device *alarmtimer_get_rtcdev(void) { -- cgit v1.2.3 From c79108bd19a8490315847e0c95ac6526fcd8e770 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:46 -0800 Subject: alarmtimer: Make alarmtimer platform device child of RTC device The alarmtimer_suspend() function will fail if an RTC device is on a bus such as SPI or i2c and that RTC device registers and probes after alarmtimer_init() registers and probes the 'alarmtimer' platform device. This is because system wide suspend suspends devices in the reverse order of their probe. When alarmtimer_suspend() attempts to program the RTC for a wakeup it will try to program an RTC device on a bus that has already been suspended. Move the alarmtimer device registration to happen when the RTC which is used for wakeup is registered. Register the 'alarmtimer' platform device as a child of the RTC device too, so that it can be guaranteed that the RTC device won't be suspended when alarmtimer_suspend() is called. Reported-by: Douglas Anderson Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200124055849.154411-2-swboyd@chromium.org --- kernel/time/alarmtimer.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 564ff5df2b0b..f0469ccc84ee 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -89,6 +89,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct wakeup_source *__ws; + struct platform_device *pdev; int ret = 0; if (rtcdev) @@ -100,9 +101,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, return -1; __ws = wakeup_source_register(dev, "alarmtimer"); + pdev = platform_device_register_data(dev, "alarmtimer", + PLATFORM_DEVID_AUTO, NULL, 0); spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { + if (__ws && !IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { ret = -1; goto unlock; @@ -113,10 +116,14 @@ static int alarmtimer_rtc_add_device(struct device *dev, get_device(dev); ws = __ws; __ws = NULL; + pdev = NULL; + } else { + ret = -1; } unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); + platform_device_unregister(pdev); wakeup_source_unregister(__ws); return ret; @@ -903,8 +910,7 @@ static void get_boottime_timespec(struct timespec64 *tp) */ static int __init alarmtimer_init(void) { - struct platform_device *pdev; - int error = 0; + int error; int i; alarmtimer_rtc_timer_init(); @@ -929,15 +935,7 @@ static int __init alarmtimer_init(void) if (error) goto out_if; - pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); - if (IS_ERR(pdev)) { - error = PTR_ERR(pdev); - goto out_drv; - } return 0; - -out_drv: - platform_driver_unregister(&alarmtimer_driver); out_if: alarmtimer_rtc_interface_remove(); return error; -- cgit v1.2.3 From 7c94caca877b0feeca6f5f7b07d48c508e20d58f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:47 -0800 Subject: alarmtimer: Use wakeup source from alarmtimer platform device Use the wakeup source that can be associated with the 'alarmtimer' platform device instead of registering another one by hand. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200124055849.154411-3-swboyd@chromium.org --- kernel/time/alarmtimer.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f0469ccc84ee..685ff57a1d87 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -58,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock); #endif #ifdef CONFIG_RTC_CLASS -static struct wakeup_source *ws; - /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -88,7 +86,6 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); - struct wakeup_source *__ws; struct platform_device *pdev; int ret = 0; @@ -100,12 +97,13 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; - __ws = wakeup_source_register(dev, "alarmtimer"); pdev = platform_device_register_data(dev, "alarmtimer", PLATFORM_DEVID_AUTO, NULL, 0); + if (!IS_ERR(pdev)) + device_init_wakeup(&pdev->dev, true); spin_lock_irqsave(&rtcdev_lock, flags); - if (__ws && !IS_ERR(pdev) && !rtcdev) { + if (!IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { ret = -1; goto unlock; @@ -114,8 +112,6 @@ static int alarmtimer_rtc_add_device(struct device *dev, rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); - ws = __ws; - __ws = NULL; pdev = NULL; } else { ret = -1; @@ -124,7 +120,6 @@ unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - wakeup_source_unregister(__ws); return ret; } @@ -291,7 +286,7 @@ static int alarmtimer_suspend(struct device *dev) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { - __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + pm_wakeup_event(dev, 2 * MSEC_PER_SEC); return -EBUSY; } @@ -306,7 +301,7 @@ static int alarmtimer_suspend(struct device *dev) /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) - __pm_wakeup_event(ws, MSEC_PER_SEC); + pm_wakeup_event(dev, MSEC_PER_SEC); return ret; } -- cgit v1.2.3 From fd928f3e32ba09381b287f8b732418434d932855 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:48 -0800 Subject: alarmtimer: Make alarmtimer_get_rtcdev() a stub when CONFIG_RTC_CLASS=n The stubbed version of alarmtimer_get_rtcdev() is not exported. so this won't work if this function is used in a module when CONFIG_RTC_CLASS=n. Move the stub function to the header file and make it inline so that callers don't have to worry about linking against this symbol. rtcdev isn't used outside of this ifdef so it's not required to be redefined to NULL. Drop that while touching this area. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200124055849.154411-4-swboyd@chromium.org --- kernel/time/alarmtimer.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 685ff57a1d87..2ffb466af77e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -143,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } -- cgit v1.2.3 From dfb6cd1e654315168e36d947471bd2a0ccd834ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Jan 2020 17:47:49 -0500 Subject: tracing: Fix very unlikely race of registering two stat tracers Looking through old emails in my INBOX, I came across a patch from Luis Henriques that attempted to fix a race of two stat tracers registering the same stat trace (extremely unlikely, as this is done in the kernel, and probably doesn't even exist). The submitted patch wasn't quite right as it needed to deal with clean up a bit better (if two stat tracers were the same, it would have the same files). But to make the code cleaner, all we needed to do is to keep the all_stat_sessions_mutex held for most of the registering function. Link: http://lkml.kernel.org/r/1410299375-20068-1-git-send-email-luis.henriques@canonical.com Fixes: 002bb86d8d42f ("tracing/ftrace: separate events tracing and stats tracing engine") Reported-by: Luis Henriques Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stat.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 874f1274cf99..da8a38c3d5e4 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -304,7 +304,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret; + int ret = -EINVAL; if (!trace) return -EINVAL; @@ -315,17 +315,15 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { - if (node->ts == trace) { - mutex_unlock(&all_stat_sessions_mutex); - return -EINVAL; - } + if (node->ts == trace) + goto out; } - mutex_unlock(&all_stat_sessions_mutex); + ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - return -ENOMEM; + goto out; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -334,15 +332,16 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - return ret; + goto out; } + ret = 0; /* Register */ - mutex_lock(&all_stat_sessions_mutex); list_add_tail(&session->session_list, &all_stat_sessions); + out: mutex_unlock(&all_stat_sessions_mutex); - return 0; + return ret; } void unregister_stat_tracer(struct tracer_stat *trace) -- cgit v1.2.3 From afccc00f75bbbee4e4ae833a96c2d29a7259c693 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 9 Sep 2014 22:49:41 +0100 Subject: tracing: Fix tracing_stat return values in error handling paths tracing_stat_init() was always returning '0', even on the error paths. It now returns -ENODEV if tracing_init_dentry() fails or -ENOMEM if it fails to created the 'trace_stat' debugfs directory. Link: http://lkml.kernel.org/r/1410299381-20108-1-git-send-email-luis.henriques@canonical.com Fixes: ed6f1c996bfe4 ("tracing: Check return value of tracing_init_dentry()") Signed-off-by: Luis Henriques [ Pulled from the archeological digging of my INBOX ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stat.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index da8a38c3d5e4..d1fa19773cc8 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -280,18 +280,22 @@ static int tracing_stat_init(void) d_tracing = tracing_init_dentry(); if (IS_ERR(d_tracing)) - return 0; + return -ENODEV; stat_dir = tracefs_create_dir("trace_stat", d_tracing); - if (!stat_dir) + if (!stat_dir) { pr_warn("Could not create tracefs 'trace_stat' entry\n"); + return -ENOMEM; + } return 0; } static int init_stat_file(struct stat_session *session) { - if (!stat_dir && tracing_stat_init()) - return -ENODEV; + int ret; + + if (!stat_dir && (ret = tracing_stat_init())) + return ret; session->file = tracefs_create_file(session->ts->name, 0644, stat_dir, -- cgit v1.2.3 From cbc3b92ce037f5e7536f6db157d185cd8b8f615c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Sep 2014 16:14:12 -0400 Subject: tracing: Set kernel_stack's caller size properly I noticed when trying to use the trace-cmd python interface that reading the raw buffer wasn't working for kernel_stack events. This is because it uses a stubbed version of __dynamic_array that doesn't do the __data_loc trick and encode the length of the array into the field. Instead it just shows up as a size of 0. So change this to __array and set the len to FTRACE_STACK_ENTRIES since this is what we actually do in practice and matches how user_stack_trace works. Link: http://lkml.kernel.org/r/1411589652-1318-1-git-send-email-jbacik@fb.com Signed-off-by: Josef Bacik [ Pulled from the archeological digging of my INBOX ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_entries.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fc8e97328e54..78c146efb862 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -174,7 +174,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, F_STRUCT( __field( int, size ) - __dynamic_array(unsigned long, caller ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" -- cgit v1.2.3 From b3f7a6cd490112085eadb578b6f4a5a34d140726 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Nov 2014 21:30:12 +0300 Subject: tracing: Remove unneeded NULL check We checked "iter->trace" earlier so there is no need to check here. Link: http://lkml.kernel.org/r/20141122183012.GB6994@mwanda Signed-off-by: Dan Carpenter [ Pulled from the archeological digging of my INBOX ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1410b4462ac..6fed9b0a8d58 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4224,7 +4224,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) mutex_init(&iter->mutex); /* Notify the tracer early; before we stop tracing. */ - if (iter->trace && iter->trace->open) + if (iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ -- cgit v1.2.3 From 28394da25888168df379c40910591b95e8e449f7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Jan 2020 20:47:46 -0500 Subject: tracing: Decrement trace_array when bootconfig creates an instance The trace_array_get_by_name() creates a ftrace instance and trace_array_put() is used to remove the reference. Even though the trace_array_get_by_name() creates the instance, it also adds a reference count to it, that prevents user space from removing it. As the bootconfig just creates the instance on boot up, it should still be used where it can be deleted by user space after boot. A trace_array_put() is required to let that happen. Also, change the documentation on trace_array_get_by_name() to make this not be so confusing. Link: https://lore.kernel.org/r/20200124205927.76128804@rorschach.local.home Fixes: 4f712a4d04a4e ("tracing/boot: Add instance node support") Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++++ kernel/trace/trace_boot.c | 1 + 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6fed9b0a8d58..0a5569b1cace 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8602,6 +8602,10 @@ out_unlock: * NOTE: This function increments the reference counter associated with the * trace array returned. This makes sure it cannot be freed while in use. * Use trace_array_put() once the trace array is no longer needed. + * If the trace_array is to be freed, trace_array_destroy() needs to + * be called after the trace_array_put(), or simply let user space delete + * it from the tracefs instances directory. But until the + * trace_array_put() is called, user space can not delete it. * */ struct trace_array *trace_array_get_by_name(const char *name) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index cd541ac1cbc1..2f616cd926b0 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -327,6 +327,7 @@ trace_boot_init_instances(struct xbc_node *node) continue; } trace_boot_init_one_instance(tr, inode); + trace_array_put(tr); } } -- cgit v1.2.3 From 84ad7a7ab69f112c0c4b878c9be91b950a1fb1f8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Jan 2020 17:15:06 +0100 Subject: bpf: Allow BTF ctx access for string pointers When accessing the context we allow access to arguments with scalar type and pointer to struct. But we deny access for pointer to scalar type, which is the case for many functions. Alexei suggested to take conservative approach and allow currently only string pointer access, which is the case for most functions now: Adding check if the pointer is to string type and allow access to it. Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200123161508.915203-2-jolsa@kernel.org --- kernel/bpf/btf.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 32963b6d5a9c..b7c1660fb594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3669,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) } } +static bool is_string_ptr(struct btf *btf, const struct btf_type *t) +{ + /* t comes in already as a pointer */ + t = btf_type_by_id(btf, t->type); + + /* allow const */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) + t = btf_type_by_id(btf, t->type); + + /* char, signed char, unsigned char */ + return btf_type_is_int(t) && t->size == 1; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -3735,6 +3748,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, */ return true; + if (is_string_ptr(btf, t)) + return true; + /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; -- cgit v1.2.3 From e9b4e606c2289d6610113253922bb8c9ac7f68b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Jan 2020 17:15:07 +0100 Subject: bpf: Allow to resolve bpf trampoline and dispatcher in unwind When unwinding the stack we need to identify each address to successfully continue. Adding latch tree to keep trampolines for quick lookup during the unwind. The patch uses first 48 bytes for latch tree node, leaving 4048 bytes from the rest of the page for trampoline or dispatcher generated code. It's still enough not to affect trampoline and dispatcher progs maximum counts. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200123161508.915203-3-jolsa@kernel.org --- kernel/bpf/dispatcher.c | 4 +-- kernel/bpf/trampoline.c | 80 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/extable.c | 7 +++-- 3 files changed, 79 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 204ee61a3904..b3e5b214fed8 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (PAGE_SIZE / 2); + noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_image_alloc(); if (!d->image) goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index eb64c245052b..6b264a92064b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -4,6 +4,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -16,11 +17,12 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table */ +/* serializes access to trampoline_table and image_tree */ static DEFINE_MUTEX(trampoline_mutex); -void *bpf_jit_alloc_exec_page(void) +static void *bpf_jit_alloc_exec_page(void) { void *image; @@ -36,6 +38,64 @@ void *bpf_jit_alloc_exec_page(void) return image; } +static __always_inline bool image_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + struct bpf_image *ia = container_of(a, struct bpf_image, tnode); + struct bpf_image *ib = container_of(b, struct bpf_image, tnode); + + return ia < ib; +} + +static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) +{ + void *image = container_of(n, struct bpf_image, tnode); + + if (addr < image) + return -1; + if (addr >= image + PAGE_SIZE) + return 1; + + return 0; +} + +static const struct latch_tree_ops image_tree_ops = { + .less = image_tree_less, + .comp = image_tree_comp, +}; + +static void *__bpf_image_alloc(bool lock) +{ + struct bpf_image *image; + + image = bpf_jit_alloc_exec_page(); + if (!image) + return NULL; + + if (lock) + mutex_lock(&trampoline_mutex); + latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); + if (lock) + mutex_unlock(&trampoline_mutex); + return image->data; +} + +void *bpf_image_alloc(void) +{ + return __bpf_image_alloc(true); +} + +bool is_bpf_image_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; + rcu_read_unlock(); + + return ret; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -56,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); + image = __bpf_image_alloc(false); if (!image) { kfree(tr); tr = NULL; @@ -131,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 + * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ #define BPF_MAX_TRAMP_PROGS 40 static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; @@ -174,7 +234,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) */ synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, @@ -284,6 +344,8 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + struct bpf_image *image; + if (!tr) return; mutex_lock(&trampoline_mutex); @@ -294,9 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + image = container_of(tr->image, struct bpf_image, data); + latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(tr->image); + bpf_jit_free_exec(image); hlist_del(&tr->hlist); kfree(tr); out: diff --git a/kernel/extable.c b/kernel/extable.c index f6920a11e28a..a0024f27d3a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr) * triggers a stack trace, or a WARN() that happens during * coming back from idle, or cpu on or offlining. * - * is_module_text_address() as well as the kprobe slots - * and is_bpf_text_address() require RCU to be watching. + * is_module_text_address() as well as the kprobe slots, + * is_bpf_text_address() and is_bpf_image_address require + * RCU to be watching. */ no_rcu = !rcu_is_watching(); @@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; + if (is_bpf_image_address(addr)) + goto out; ret = 0; out: if (no_rcu) -- cgit v1.2.3 From 24589e3a20876dc07c62f45c8f8f8266dd39ba38 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 25 Jan 2020 10:52:30 -0500 Subject: tracing: Use pr_err() instead of WARN() for memory failures As warnings can trigger panics, especially when "panic_on_warn" is set, memory failure warnings can cause panics and fail fuzz testers that are stressing memory. Create a MEM_FAIL() macro to use instead of WARN() in the tracing code (perhaps this should be a kernel wide macro?), and use that for memory failure issues. This should stop failing fuzz tests due to warnings. Link: https://lore.kernel.org/r/CACT4Y+ZP-7np20GVRu3p+eZys9GPtbu+JpfV+HtsufAzvTgJrg@mail.gmail.com Suggested-by: Dmitry Vyukov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 18 +++++++++--------- kernel/trace/trace.h | 12 ++++++++++++ 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5c701765da5b..fdb1a9532420 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5459,7 +5459,7 @@ static void __init set_ftrace_early_graph(char *buf, int enable) struct ftrace_hash *hash; hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); - if (WARN_ON(!hash)) + if (MEM_FAIL(!hash, "Failed to allocate hash\n")) return; while (buf) { @@ -6591,7 +6591,7 @@ static void add_to_clear_hash_list(struct list_head *clear_list, func = kmalloc(sizeof(*func), GFP_KERNEL); if (!func) { - WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n"); + MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n"); return; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0a5569b1cace..6a28b1b9bf42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3126,7 +3126,7 @@ static int alloc_percpu_trace_buffer(void) struct trace_buffer_struct *buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; trace_percpu_buffer = buffers; @@ -7932,7 +7932,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); - WARN_ONCE(!tr->percpu_dir, + MEM_FAIL(!tr->percpu_dir, "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; @@ -8253,7 +8253,7 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); - WARN_ONCE(topts[cnt].entry == NULL, + MEM_FAIL(topts[cnt].entry == NULL, "Failed to create trace option: %s", opts[cnt].name); } @@ -8437,7 +8437,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) #ifdef CONFIG_TRACER_MAX_TRACE ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); - if (WARN_ON(ret)) { + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { ring_buffer_free(tr->array_buffer.buffer); tr->array_buffer.buffer = NULL; free_percpu(tr->array_buffer.data); @@ -8726,7 +8726,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); - if (WARN_ON(!trace_instance_dir)) + if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; } @@ -8796,7 +8796,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) #endif if (ftrace_create_function_files(tr, d_tracer)) - WARN(1, "Could not allocate function filter files"); + MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, @@ -9348,8 +9348,7 @@ __init static int tracer_alloc_buffers(void) /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { - printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); - WARN_ON(1); + MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); goto out_free_savedcmd; } @@ -9422,7 +9421,8 @@ void __init early_trace_init(void) if (tracepoint_printk) { tracepoint_print_iter = kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); - if (WARN_ON(!tracepoint_print_iter)) + if (MEM_FAIL(!tracepoint_print_iter, + "Failed to allocate trace iterator\n")) tracepoint_printk = 0; else static_key_enable(&tracepoint_printk_key.key); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4812a36affac..6bb64d89c321 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -94,6 +94,18 @@ enum trace_type { #include "trace_entries.h" +/* Use this for memory failure errors */ +#define MEM_FAIL(condition, fmt, ...) ({ \ + static bool __section(.data.once) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("ERROR: " fmt, ##__VA_ARGS__); \ + } \ + unlikely(__ret_warn_once); \ +}) + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h -- cgit v1.2.3 From 59d8cc6b2e375ff486b030da6703b1d481e186e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Jan 2020 12:00:40 -0800 Subject: rcu: Forgive slow expedited grace periods at boot time Boot-time processing often loops in the kernel longer than one might prefer, which can prevent expedited grace periods from completing in a timely manner. This in turn triggers a splat In nohz_full CPUs One could argue that long-looping code should be fixed, but on the other hand, boot time is a bit special. This commit therefore removes the splat. Later commits will add the splat back in, but in a way that removes false positives. Reported-by: Borislav Petkov Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6935a9e2b094..dcbd75791f39 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -508,7 +508,6 @@ static void synchronize_rcu_expedited_wait(void) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } } - WARN_ON_ONCE(1); } for (;;) { -- cgit v1.2.3 From 913292c97d750fe4188b4f5aa770e5e0ca1e5a91 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Fri, 24 Jan 2020 10:29:08 +0530 Subject: sched.h: Annotate sighand_struct with __rcu This patch fixes the following sparse errors by annotating the sighand_struct with __rcu kernel/fork.c:1511:9: error: incompatible types in comparison expression kernel/exit.c:100:19: error: incompatible types in comparison expression kernel/signal.c:1370:27: error: incompatible types in comparison expression This fix introduces the following sparse error in signal.c due to checking the sighand pointer without rcu primitives: kernel/signal.c:1386:21: error: incompatible types in comparison expression This new sparse error is also fixed in this patch. Signed-off-by: Madhuparna Bhowmik Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20200124045908.26389-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bcd46f547db3..9ad8dea93dbb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) + if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } -- cgit v1.2.3 From 90435a7891a2259b0f74c5a1bc5600d0d64cba8f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 25 Jan 2020 12:10:02 +0300 Subject: bpf: map_seq_next should always increase position index If seq_file .next fuction does not change position index, read after some lseek can generate an unexpected output. See also: https://bugzilla.kernel.org/show_bug.cgi?id=206283 v1 -> v2: removed missed increment in end of function Signed-off-by: Vasily Averin Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eca84fdd-c374-a154-d874-6c7b55fc3bc4@virtuozzo.com --- kernel/bpf/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e11059b99f18..bd2fd8eab470 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) void *key = map_iter(m)->key; void *prev_key; + (*pos)++; if (map_iter(m)->done) return NULL; @@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) map_iter(m)->done = true; return NULL; } - - ++(*pos); return key; } -- cgit v1.2.3 From 42a84a8cd0ff0cbff5a4595e1304c4567a30267d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 26 Jan 2020 16:14:00 -0800 Subject: bpf, xdp: Update devmap comments to reflect napi/rcu usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we rely on synchronize_rcu and call_rcu waiting to exit perempt-disable regions (NAPI) lets update the comments to reflect this. Fixes: 0536b85239b84 ("xdp: Simplify devmap cleanup") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Song Liu Link: https://lore.kernel.org/bpf/1580084042-11598-2-git-send-email-john.fastabend@gmail.com --- kernel/bpf/devmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f3a44f6ca86e..373c6a30d8a5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -190,10 +190,12 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. The rcu critical section only guarantees - * no further reads against netdev_map. It does __not__ ensure pending - * flush operations (if any) are complete. + * disconnected from events. The following synchronize_rcu() guarantees + * both rcu read critical sections complete and waits for + * preempt-disable regions (NAPI being the relevant context here) so we + * are certain there will be no further reads against the netdev_map and + * all flush operations are complete. Flush operations can only be done + * from NAPI context for this reason. */ spin_lock(&dev_map_lock); @@ -503,12 +505,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; /* Use call_rcu() here to ensure any rcu critical sections have - * completed, but this does not guarantee a flush has happened - * yet. Because driver side rcu_read_lock/unlock only protects the - * running XDP program. However, for pending flush operations the - * dev and ctx are stored in another per cpu map. And additionally, - * the driver tear down ensures all soft irqs are complete before - * removing the net device in the case of dev_put equals zero. + * completed as well as any flush operations because call_rcu + * will wait for preempt-disable region to complete, NAPI in this + * context. And additionally, the driver tear down ensures all + * soft irqs are complete before removing the net device in the + * case of dev_put equals zero. */ old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) -- cgit v1.2.3 From b23bfa5633b19bf1db87b36a76b2225c734f794c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 26 Jan 2020 16:14:02 -0800 Subject: bpf, xdp: Remove no longer required rcu_read_{un}lock() Now that we depend on rcu_call() and synchronize_rcu() to also wait for preempt_disabled region to complete the rcu read critical section in __dev_map_flush() is no longer required. Except in a few special cases in drivers that need it for other reasons. These originally ensured the map reference was safe while a map was also being free'd. And additionally that bpf program updates via ndo_bpf did not happen while flush updates were in flight. But flush by new rules can only be called from preempt-disabled NAPI context. The synchronize_rcu from the map free path and the rcu_call from the delete path will ensure the reference there is safe. So lets remove the rcu_read_lock and rcu_read_unlock pair to avoid any confusion around how this is being protected. If the rcu_read_lock was required it would mean errors in the above logic and the original patch would also be wrong. Now that we have done above we put the rcu_read_lock in the driver code where it is needed in a driver dependent way. I think this helps readability of the code so we know where and why we are taking read locks. Most drivers will not need rcu_read_locks here and further XDP drivers already have rcu_read_locks in their code paths for reading xdp programs on RX side so this makes it symmetric where we don't have half of rcu critical sections define in driver and the other half in devmap. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/1580084042-11598-4-git-send-email-john.fastabend@gmail.com --- kernel/bpf/devmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 373c6a30d8a5..58bdca5d978a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -366,16 +366,17 @@ error: * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. + * When drivers update the bpf program they may need to ensure any flush ops + * are also complete. Using synchronize_rcu or call_rcu will suffice for this + * because both wait for napi context to exit. */ void __dev_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) bq_xmit_all(bq, XDP_XMIT_FLUSH); - rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or -- cgit v1.2.3 From 20279420ae3a8ef4c5d9fedc360a2c37a1dbdf1b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Jan 2020 10:07:42 -0500 Subject: tracing/kprobes: Have uname use __get_str() in print_fmt Thomas Richter reported: > Test case 66 'Use vfs_getname probe to get syscall args filenames' > is broken on s390, but works on x86. The test case fails with: > > [root@m35lp76 perf]# perf test -F 66 > 66: Use vfs_getname probe to get syscall args filenames > :Recording open file: > [ perf record: Woken up 1 times to write data ] > [ perf record: Captured and wrote 0.004 MB /tmp/__perf_test.perf.data.TCdYj\ > (20 samples) ] > Looking at perf.data file for vfs_getname records for the file we touched: > FAILED! > [root@m35lp76 perf]# The root cause was the print_fmt of the kprobe event that referenced the "ustring" > Setting up the kprobe event using perf command: > > # ./perf probe "vfs_getname=getname_flags:72 pathname=filename:ustring" > > generates this format file: > [root@m35lp76 perf]# cat /sys/kernel/debug/tracing/events/probe/\ > vfs_getname/format > name: vfs_getname > ID: 1172 > format: > field:unsigned short common_type; offset:0; size:2; signed:0; > field:unsigned char common_flags; offset:2; size:1; signed:0; > field:unsigned char common_preempt_count; offset:3; size:1; signed:0; > field:int common_pid; offset:4; size:4; signed:1; > > field:unsigned long __probe_ip; offset:8; size:8; signed:0; > field:__data_loc char[] pathname; offset:16; size:4; signed:1; > > print fmt: "(%lx) pathname=\"%s\"", REC->__probe_ip, REC->pathname Instead of using "__get_str(pathname)" it referenced it directly. Link: http://lkml.kernel.org/r/20200124100742.4050c15e@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string") Acked-by: Masami Hiramatsu Reported-by: Thomas Richter Tested-by: Thomas Richter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9ae87be422f2..ab8b6436d53f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -876,7 +876,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; if (parg->count) { - if (strcmp(parg->type->name, "string") == 0) + if ((strcmp(parg->type->name, "string") == 0) || + (strcmp(parg->type->name, "ustring") == 0)) fmt = ", __get_str(%s[%d])"; else fmt = ", REC->%s[%d]"; @@ -884,7 +885,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, pos += snprintf(buf + pos, LEN_OR_ZERO, fmt, parg->name, j); } else { - if (strcmp(parg->type->name, "string") == 0) + if ((strcmp(parg->type->name, "string") == 0) || + (strcmp(parg->type->name, "ustring") == 0)) fmt = ", __get_str(%s)"; else fmt = ", REC->%s"; -- cgit v1.2.3 From 8d19f1c8e1937baf74e1962aae9f90fa3aeab463 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 11 Nov 2019 18:19:00 -0600 Subject: prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim There are several storage drivers like dm-multipath, iscsi, tcmu-runner, amd nbd that have userspace components that can run in the IO path. For example, iscsi and nbd's userspace deamons may need to recreate a socket and/or send IO on it, and dm-multipath's daemon multipathd may need to send SG IO or read/write IO to figure out the state of paths and re-set them up. In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the memalloc_*_save/restore functions to control the allocation behavior, but for userspace we would end up hitting an allocation that ended up writing data back to the same device we are trying to allocate for. The device is then in a state of deadlock, because to execute IO the device needs to allocate memory, but to allocate memory the memory layers want execute IO to the device. Here is an example with nbd using a local userspace daemon that performs network IO to a remote server. We are using XFS on top of the nbd device, but it can happen with any FS or other modules layered on top of the nbd device that can write out data to free memory. Here a nbd daemon helper thread, msgr-worker-1, is performing a write/sendmsg on a socket to execute a request. This kicks off a reclaim operation which results in a WRITE to the nbd device and the nbd thread calling back into the mm layer. [ 1626.609191] msgr-worker-1 D 0 1026 1 0x00004000 [ 1626.609193] Call Trace: [ 1626.609195] ? __schedule+0x29b/0x630 [ 1626.609197] ? wait_for_completion+0xe0/0x170 [ 1626.609198] schedule+0x30/0xb0 [ 1626.609200] schedule_timeout+0x1f6/0x2f0 [ 1626.609202] ? blk_finish_plug+0x21/0x2e [ 1626.609204] ? _xfs_buf_ioapply+0x2e6/0x410 [ 1626.609206] ? wait_for_completion+0xe0/0x170 [ 1626.609208] wait_for_completion+0x108/0x170 [ 1626.609210] ? wake_up_q+0x70/0x70 [ 1626.609212] ? __xfs_buf_submit+0x12e/0x250 [ 1626.609214] ? xfs_bwrite+0x25/0x60 [ 1626.609215] xfs_buf_iowait+0x22/0xf0 [ 1626.609218] __xfs_buf_submit+0x12e/0x250 [ 1626.609220] xfs_bwrite+0x25/0x60 [ 1626.609222] xfs_reclaim_inode+0x2e8/0x310 [ 1626.609224] xfs_reclaim_inodes_ag+0x1b6/0x300 [ 1626.609227] xfs_reclaim_inodes_nr+0x31/0x40 [ 1626.609228] super_cache_scan+0x152/0x1a0 [ 1626.609231] do_shrink_slab+0x12c/0x2d0 [ 1626.609233] shrink_slab+0x9c/0x2a0 [ 1626.609235] shrink_node+0xd7/0x470 [ 1626.609237] do_try_to_free_pages+0xbf/0x380 [ 1626.609240] try_to_free_pages+0xd9/0x1f0 [ 1626.609245] __alloc_pages_slowpath+0x3a4/0xd30 [ 1626.609251] ? ___slab_alloc+0x238/0x560 [ 1626.609254] __alloc_pages_nodemask+0x30c/0x350 [ 1626.609259] skb_page_frag_refill+0x97/0xd0 [ 1626.609274] sk_page_frag_refill+0x1d/0x80 [ 1626.609279] tcp_sendmsg_locked+0x2bb/0xdd0 [ 1626.609304] tcp_sendmsg+0x27/0x40 [ 1626.609307] sock_sendmsg+0x54/0x60 [ 1626.609308] ___sys_sendmsg+0x29f/0x320 [ 1626.609313] ? sock_poll+0x66/0xb0 [ 1626.609318] ? ep_item_poll.isra.15+0x40/0xc0 [ 1626.609320] ? ep_send_events_proc+0xe6/0x230 [ 1626.609322] ? hrtimer_try_to_cancel+0x54/0xf0 [ 1626.609324] ? ep_read_events_proc+0xc0/0xc0 [ 1626.609326] ? _raw_write_unlock_irq+0xa/0x20 [ 1626.609327] ? ep_scan_ready_list.constprop.19+0x218/0x230 [ 1626.609329] ? __hrtimer_init+0xb0/0xb0 [ 1626.609331] ? _raw_spin_unlock_irq+0xa/0x20 [ 1626.609334] ? ep_poll+0x26c/0x4a0 [ 1626.609337] ? tcp_tsq_write.part.54+0xa0/0xa0 [ 1626.609339] ? release_sock+0x43/0x90 [ 1626.609341] ? _raw_spin_unlock_bh+0xa/0x20 [ 1626.609342] __sys_sendmsg+0x47/0x80 [ 1626.609347] do_syscall_64+0x5f/0x1c0 [ 1626.609349] ? prepare_exit_to_usermode+0x75/0xa0 [ 1626.609351] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds a new prctl command that daemons can use after they have done their initial setup, and before they start to do allocations that are in the IO path. It sets the PF_MEMALLOC_NOIO and PF_LESS_THROTTLE flags so both userspace block and FS threads can use it to avoid the allocation recursion and try to prevent from being throttled while writing out data to free up memory. Signed-off-by: Mike Christie Acked-by: Michal Hocko Tested-by: Masato Suzuki Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20191112001900.9206-1-mchristi@redhat.com Signed-off-by: Christian Brauner --- kernel/sys.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a9331f101883..f9bc5c303e3f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,8 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2488,6 +2490,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; + case PR_SET_IO_FLUSHER: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (arg2 == 1) + current->flags |= PR_IO_FLUSHER; + else if (!arg2) + current->flags &= ~PR_IO_FLUSHER; + else + return -EINVAL; + break; + case PR_GET_IO_FLUSHER: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 25a3a15417cf4311f812f5a2b18c5fc2809f66d7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 27 Jan 2020 09:39:15 +0100 Subject: smp: Remove superfluous cond_func check in smp_call_function_many_cond() It was requested to remove the cond_func check but the follow up patch was overlooked. Remove it now. Fixes: 67719ef25eeb ("smp: Add a smp_cond_func_t argument to smp_call_function_many()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200127083915.434tdkztorkklpdu@linutronix.de --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 3b7bedc97af3..d0ada39eb4d4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -435,7 +435,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, /* Fastpath: do that cpu by itself. */ if (next_cpu >= nr_cpu_ids) { - if (!cond_func || (cond_func && cond_func(cpu, info))) + if (!cond_func || cond_func(cpu, info)) smp_call_function_single(cpu, func, info, wait); return; } -- cgit v1.2.3 From 003461559ef7a9bd0239bae35a22ad8924d6e9ad Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jan 2020 10:11:46 -0800 Subject: perf/core: Fix mlock accounting in perf_mmap() Decreasing sysctl_perf_event_mlock between two consecutive perf_mmap()s of a perf ring buffer may lead to an integer underflow in locked memory accounting. This may lead to the undesired behaviors, such as failures in BPF map creation. Address this by adjusting the accounting logic to take into account the possibility that the amount of already locked memory may exceed the current limit. Fixes: c4b75479741c ("perf/core: Make the mlock accounting simple again") Suggested-by: Alexander Shishkin Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Acked-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200123181146.2238074-1-songliubraving@fb.com --- kernel/events/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2173c23c25b4..2d9aeba1f3e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5916,7 +5916,15 @@ accounting: */ user_lock_limit *= num_online_cpus(); - user_locked = atomic_long_read(&user->locked_vm) + user_extra; + user_locked = atomic_long_read(&user->locked_vm); + + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += user_extra; if (user_locked > user_lock_limit) { /* -- cgit v1.2.3 From 07c5972951f088094776038006a0592a46d14bbc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 22 Jan 2020 11:50:27 -0800 Subject: perf/cgroups: Install cgroup events to correct cpuctx cgroup events are always installed in the cpuctx. However, when it is not installed via IPI, list_update_cgroup_event() adds it to cpuctx of current CPU, which triggers list corruption: [] list_add double add: new=ffff888ff7cf0db0, prev=ffff888ff7ce82f0, next=ffff888ff7cf0db0. To reproduce this, we can simply run: # perf stat -e cs -a & # perf stat -e cs -G anycgroup Fix this by installing it to cpuctx that contains event->ctx, and the proper cgrp_cpuctx_list. Fixes: db0503e4f675 ("perf/core: Optimize perf_install_in_event()") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Link: https://lkml.kernel.org/r/20200122195027.2112449-1-songliubraving@fb.com --- kernel/events/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d9aeba1f3e2..fdb7f7ef380c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event, /* * Because cgroup events are always per-cpu events, - * this will always be called from the right CPU. + * @ctx == &cpuctx->ctx. */ - cpuctx = __get_cpu_context(ctx); + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); /* * Since setting cpuctx->cgrp is conditional on the current @cgrp @@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; if (add) - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + list_add(cpuctx_entry, + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); else list_del(cpuctx_entry); } -- cgit v1.2.3 From 488603b815a7514c7009e6fc339d74ed4a30f343 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Sat, 11 Jan 2020 04:53:38 -0500 Subject: sched/core: Don't skip remote tick for idle CPUs This will be used in the next patch to get a loadavg update from nohz cpus. The delta check is skipped because idle_sched_class doesn't update se.exec_start. Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-2-git-send-email-swood@redhat.com --- kernel/sched/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc1dfc007604..cf8b33dc4513 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3669,22 +3669,24 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); out_unlock: -- cgit v1.2.3 From ebc0f83c78a2d26384401ecf2d2fa48063c0ee27 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 11 Jan 2020 04:53:39 -0500 Subject: timers/nohz: Update NOHZ load in remote tick The way loadavg is tracked during nohz only pays attention to the load upon entering nohz. This can be particularly noticeable if full nohz is entered while non-idle, and then the cpu goes idle and stays that way for a long time. Use the remote tick to ensure that full nohz cpus report their deltas within a reasonable time. [ swood: Added changelog and removed recheck of stopped tick. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-3-git-send-email-swood@redhat.com --- kernel/sched/core.c | 4 +++- kernel/sched/loadavg.c | 33 +++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8b33dc4513..4ff03c27779e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3677,6 +3677,7 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -3689,10 +3690,11 @@ static void sched_tick_remote(struct work_struct *work) } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); -- cgit v1.2.3 From b396f52326de20ec974471b7b19168867b365cbf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Jan 2020 10:13:20 +0000 Subject: sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains The CPU load balancer balances between different domains to spread load and strives to have equal balance everywhere. Communicating tasks can migrate so they are topologically close to each other but these decisions are independent. On a lightly loaded NUMA machine, two communicating tasks pulled together at wakeup time can be pushed apart by the load balancer. In isolation, the load balancer decision is fine but it ignores the tasks data locality and the wakeup/LB paths continually conflict. NUMA balancing is also a factor but it also simply conflicts with the load balancer. This patch allows a fixed degree of imbalance of two tasks to exist between NUMA domains regardless of utilisation levels. In many cases, this prevents communicating tasks being pulled apart. It was evaluated whether the imbalance should be scaled to the domain size. However, no additional benefit was measured across a range of workloads and machines and scaling adds the risk that lower domains have to be rebalanced. While this could change again in the future, such a change should specify the use case and benefit. The most obvious impact is on netperf TCP_STREAM -- two simple communicating tasks with some softirq offload depending on the transmission rate. 2-socket Haswell machine 48 core, HT enabled netperf-tcp -- mmtests config config-network-netperf-unbound baseline lbnuma-v3 Hmean 64 568.73 ( 0.00%) 577.56 * 1.55%* Hmean 128 1089.98 ( 0.00%) 1128.06 * 3.49%* Hmean 256 2061.72 ( 0.00%) 2104.39 * 2.07%* Hmean 1024 7254.27 ( 0.00%) 7557.52 * 4.18%* Hmean 2048 11729.20 ( 0.00%) 13350.67 * 13.82%* Hmean 3312 15309.08 ( 0.00%) 18058.95 * 17.96%* Hmean 4096 17338.75 ( 0.00%) 20483.66 * 18.14%* Hmean 8192 25047.12 ( 0.00%) 27806.84 * 11.02%* Hmean 16384 27359.55 ( 0.00%) 33071.88 * 20.88%* Stddev 64 2.16 ( 0.00%) 2.02 ( 6.53%) Stddev 128 2.31 ( 0.00%) 2.19 ( 5.05%) Stddev 256 11.88 ( 0.00%) 3.22 ( 72.88%) Stddev 1024 23.68 ( 0.00%) 7.24 ( 69.43%) Stddev 2048 79.46 ( 0.00%) 71.49 ( 10.03%) Stddev 3312 26.71 ( 0.00%) 57.80 (-116.41%) Stddev 4096 185.57 ( 0.00%) 96.15 ( 48.19%) Stddev 8192 245.80 ( 0.00%) 100.73 ( 59.02%) Stddev 16384 207.31 ( 0.00%) 141.65 ( 31.67%) In this case, there was a sizable improvement to performance and a general reduction in variance. However, this is not univeral. For most machines, the impact was roughly a 3% performance gain. Ops NUMA base-page range updates 19796.00 292.00 Ops NUMA PTE updates 19796.00 292.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 16113.00 143.00 Ops NUMA hint local faults % 8407.00 142.00 Ops NUMA hint local percent 52.18 99.30 Ops NUMA pages migrated 4244.00 1.00 Without the patch, only 52.18% of sampled accesses are local. In an earlier changelog, 100% of sampled accesses are local and indeed on most machines, this was still the case. In this specific case, the local sampled rates was 99.3% but note the "base-page range updates" and "PTE updates". The activity with the patch is negligible as were the number of faults. The small number of pages migrated were related to shared libraries. A 2-socket Broadwell showed better results on average but are not presented for brevity as the performance was similar except it showed 100% of the sampled NUMA hints were local. The patch holds up for a 4-socket Haswell, an AMD EPYC and AMD Epyc 2 machine. For dbench, the impact depends on the filesystem used and the number of clients. On XFS, there is little difference as the clients typically communicate with workqueues which have a separate class of scheduler problem at the moment. For ext4, performance is generally better, particularly for small numbers of clients as NUMA balancing activity is negligible with the patch applied. A more interesting example is the Facebook schbench which uses a number of messaging threads to communicate with worker threads. In this configuration, one messaging thread is used per NUMA node and the number of worker threads is varied. The 50, 75, 90, 95, 99, 99.5 and 99.9 percentiles for response latency is then reported. Lat 50.00th-qrtle-1 44.00 ( 0.00%) 37.00 ( 15.91%) Lat 75.00th-qrtle-1 53.00 ( 0.00%) 41.00 ( 22.64%) Lat 90.00th-qrtle-1 57.00 ( 0.00%) 42.00 ( 26.32%) Lat 95.00th-qrtle-1 63.00 ( 0.00%) 43.00 ( 31.75%) Lat 99.00th-qrtle-1 76.00 ( 0.00%) 51.00 ( 32.89%) Lat 99.50th-qrtle-1 89.00 ( 0.00%) 52.00 ( 41.57%) Lat 99.90th-qrtle-1 98.00 ( 0.00%) 55.00 ( 43.88%) Lat 50.00th-qrtle-2 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 75.00th-qrtle-2 48.00 ( 0.00%) 47.00 ( 2.08%) Lat 90.00th-qrtle-2 53.00 ( 0.00%) 52.00 ( 1.89%) Lat 95.00th-qrtle-2 55.00 ( 0.00%) 53.00 ( 3.64%) Lat 99.00th-qrtle-2 62.00 ( 0.00%) 60.00 ( 3.23%) Lat 99.50th-qrtle-2 63.00 ( 0.00%) 63.00 ( 0.00%) Lat 99.90th-qrtle-2 68.00 ( 0.00%) 66.00 ( 2.94% For higher worker threads, the differences become negligible but it's interesting to note the difference in wakeup latency at low utilisation and mpstat confirms that activity was almost all on one node until the number of worker threads increase. Hackbench generally showed neutral results across a range of machines. This is different to earlier versions of the patch which allowed imbalances for higher degrees of utilisation. perf bench pipe showed negligible differences in overall performance as the differences are very close to the noise. An earlier prototype of the patch showed major regressions for NAS C-class when running with only half of the available CPUs -- 20-30% performance hits were measured at the time. With this version of the patch, the impact is negligible with small gains/losses within the noise measured. This is because the number of threads far exceeds the small imbalance the aptch cares about. Similarly, there were report of regressions for the autonuma benchmark against earlier versions but again, normal load balancing now applies for that workload. In general, the patch simply seeks to avoid unnecessary cross-node migrations in the basic case where imbalances are very small. For low utilisation communicating workloads, this patch generally behaves better with less NUMA balancing activity. For high utilisation, there is no change in behaviour. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200114101319.GO3466@techsingularity.net --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4e0d775375..25dffc03f0f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8658,10 +8658,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or * emptying busiest. - * XXX Spreading tasks across NUMA nodes is not always the best policy - * and special care should be taken for SD_NUMA domain level before - * spreading the tasks. For now, load_balance() fully relies on - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { @@ -8701,16 +8697,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; - return; - } + } else { - /* - * If there is no overload, we just want to even the number of - * idle cpus. - */ - env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; + + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } + return; } -- cgit v1.2.3 From b562d140649966d4daedd0483a8fe59ad3bb465a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 14 Jan 2020 21:09:47 +0000 Subject: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). # echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ff03c27779e..55b9a9c53b91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7264,7 +7264,7 @@ capacity_from_percent(char *buf) &req.percent); if (req.ret) return req; - if (req.percent > UCLAMP_PERCENT_SCALE) { + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { req.ret = -ERANGE; return req; } -- cgit v1.2.3 From e938b9c94164e4d981039f1cf6007d7453883e5a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jan 2020 08:50:27 +0800 Subject: sched/nohz: Optimize get_nohz_timer_target() On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the same socket are in nohz_full mode. We can observe huge time burn in the loop for seaching nearest busy housekeeper cpu by ftrace. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the searching logic by finding a nearest housekeeper CPU in the housekeeping cpumask, it can minimize the worst searching time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper can become a random candidate while current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com --- kernel/sched/core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55b9a9c53b91..a8a5d5b6f5cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -552,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; -- cgit v1.2.3 From 2a4b03ffc69f2dedc6388e9a6438b5f4c133a40d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 14 Jan 2020 15:13:56 +0100 Subject: sched/fair: Prevent unlimited runtime on throttled group When a running task is moved on a throttled task group and there is no other task enqueued on the CPU, the task can keep running using 100% CPU whatever the allocated bandwidth for the group and although its cfs rq is throttled. Furthermore, the group entity of the cfs_rq and its parents are not enqueued but only set as curr on their respective cfs_rqs. We have the following sequence: sched_move_task -dequeue_task: dequeue task and group_entities. -put_prev_task: put task and group entities. -sched_change_group: move task to new group. -enqueue_task: enqueue only task but not group entities because cfs_rq is throttled. -set_next_task : set task and group_entities as current sched_entity of their cfs_rq. Another impact is that the root cfs_rq runnable_load_avg at root rq stays null because the group_entities are not enqueued. This situation will stay the same until an "external" event triggers a reschedule. Let trigger it immediately instead. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ben Segall Link: https://lkml.kernel.org/r/1579011236-31256-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a5d5b6f5cf..89e54f3ed571 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7072,8 +7072,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) + if (running) { set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } -- cgit v1.2.3 From b527b638fd63ba791dc90a0a6e9a3035b10df52b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 28 Jun 2019 12:40:20 -0500 Subject: tracing: Simplify assignment parsing for hist triggers In the process of adding better error messages for sorting, I realized that strsep was being used incorrectly and some of the error paths I was expecting to be hit weren't and just fell through to the common invalid key error case. It also became obvious that for keyword assignments, it wasn't necessary to save the full assignment and reparse it later, and having a common empty-assignment check would also make more sense in terms of error processing. Change the code to fix these problems and simplify it for new error message changes in a subsequent patch. Link: http://lkml.kernel.org/r/1c3ef0b6655deaf345f6faee2584a0298ac2d743.1561743018.git.zanussi@kernel.org Fixes: e62347d24534 ("tracing: Add hist trigger support for user-defined sorting ('sort=' param)") Fixes: 7ef224d1d0e3 ("tracing: Add 'hist' event trigger command") Fixes: a4072fe85ba3 ("tracing: Add a clock attribute for hist triggers") Reported-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 70 ++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 117a1202a6b9..bf2bcb8d7725 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2039,12 +2039,6 @@ static int parse_map_size(char *str) unsigned long size, map_bits; int ret; - strsep(&str, "="); - if (!str) { - ret = -EINVAL; - goto out; - } - ret = kstrtoul(str, 0, &size); if (ret) goto out; @@ -2104,25 +2098,25 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) static int parse_assignment(struct trace_array *tr, char *str, struct hist_trigger_attrs *attrs) { - int ret = 0; + int len, ret = 0; - if ((str_has_prefix(str, "key=")) || - (str_has_prefix(str, "keys="))) { - attrs->keys_str = kstrdup(str, GFP_KERNEL); + if ((len = str_has_prefix(str, "key=")) || + (len = str_has_prefix(str, "keys="))) { + attrs->keys_str = kstrdup(str + len, GFP_KERNEL); if (!attrs->keys_str) { ret = -ENOMEM; goto out; } - } else if ((str_has_prefix(str, "val=")) || - (str_has_prefix(str, "vals=")) || - (str_has_prefix(str, "values="))) { - attrs->vals_str = kstrdup(str, GFP_KERNEL); + } else if ((len = str_has_prefix(str, "val=")) || + (len = str_has_prefix(str, "vals=")) || + (len = str_has_prefix(str, "values="))) { + attrs->vals_str = kstrdup(str + len, GFP_KERNEL); if (!attrs->vals_str) { ret = -ENOMEM; goto out; } - } else if (str_has_prefix(str, "sort=")) { - attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + } else if ((len = str_has_prefix(str, "sort="))) { + attrs->sort_key_str = kstrdup(str + len, GFP_KERNEL); if (!attrs->sort_key_str) { ret = -ENOMEM; goto out; @@ -2133,12 +2127,8 @@ static int parse_assignment(struct trace_array *tr, ret = -ENOMEM; goto out; } - } else if (str_has_prefix(str, "clock=")) { - strsep(&str, "="); - if (!str) { - ret = -EINVAL; - goto out; - } + } else if ((len = str_has_prefix(str, "clock="))) { + str += len; str = strstrip(str); attrs->clock = kstrdup(str, GFP_KERNEL); @@ -2146,8 +2136,8 @@ static int parse_assignment(struct trace_array *tr, ret = -ENOMEM; goto out; } - } else if (str_has_prefix(str, "size=")) { - int map_bits = parse_map_size(str); + } else if ((len = str_has_prefix(str, "size="))) { + int map_bits = parse_map_size(str + len); if (map_bits < 0) { ret = map_bits; @@ -2187,8 +2177,14 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) while (trigger_str) { char *str = strsep(&trigger_str, ":"); + char *rhs; - if (strchr(str, '=')) { + rhs = strchr(str, '='); + if (rhs) { + if (!strlen(++rhs)) { + ret = -EINVAL; + goto free; + } ret = parse_assignment(tr, str, attrs); if (ret) goto free; @@ -4522,10 +4518,6 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (!fields_str) goto out; - strsep(&fields_str, "="); - if (!fields_str) - goto out; - for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && j < TRACING_MAP_VALS_MAX; i++) { field_str = strsep(&fields_str, ","); @@ -4620,10 +4612,6 @@ static int create_key_fields(struct hist_trigger_data *hist_data, if (!fields_str) goto out; - strsep(&fields_str, "="); - if (!fields_str) - goto out; - for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { field_str = strsep(&fields_str, ","); if (!field_str) @@ -4781,12 +4769,6 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) if (!fields_str) goto out; - strsep(&fields_str, "="); - if (!fields_str) { - ret = -EINVAL; - goto out; - } - for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { struct hist_field *hist_field; char *field_str, *field_name; @@ -4795,9 +4777,11 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) sort_key = &hist_data->sort_keys[i]; field_str = strsep(&fields_str, ","); - if (!field_str) { - if (i == 0) - ret = -EINVAL; + if (!field_str) + break; + + if (!*field_str) { + ret = -EINVAL; break; } @@ -4807,7 +4791,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } field_name = strsep(&field_str, "."); - if (!field_name) { + if (!field_name || !*field_name) { ret = -EINVAL; break; } -- cgit v1.2.3 From 4de26c8c967d55551d3983771116a2c3c0a4f464 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 28 Jun 2019 12:40:21 -0500 Subject: tracing: Add hist trigger error messages for sort specification Add error codes and messages for all the error paths leading to sort specification parsing errors. Link: http://lkml.kernel.org/r/237830dc05e583fbb53664d817a784297bf961be.1561743018.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bf2bcb8d7725..23458ba9e5f5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -66,7 +66,12 @@ C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ C(VAR_NOT_FOUND, "Couldn't find variable"), \ - C(FIELD_NOT_FOUND, "Couldn't find field"), + C(FIELD_NOT_FOUND, "Couldn't find field"), \ + C(EMPTY_ASSIGNMENT, "Empty assignment"), \ + C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \ + C(EMPTY_SORT_FIELD, "Empty sort field"), \ + C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ + C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), #undef C #define C(a, b) HIST_ERR_##a @@ -2183,6 +2188,7 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) if (rhs) { if (!strlen(++rhs)) { ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_ASSIGNMENT, errpos(str)); goto free; } ret = parse_assignment(tr, str, attrs); @@ -4743,7 +4749,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, return ret; } -static int is_descending(const char *str) +static int is_descending(struct trace_array *tr, const char *str) { if (!str) return 0; @@ -4754,11 +4760,14 @@ static int is_descending(const char *str) if (strcmp(str, "ascending") == 0) return 0; + hist_err(tr, HIST_ERR_INVALID_SORT_MODIFIER, errpos((char *)str)); + return -EINVAL; } static int create_sort_keys(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *fields_str = hist_data->attrs->sort_key_str; struct tracing_map_sort_key *sort_key; int descending, ret = 0; @@ -4782,10 +4791,12 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) if (!*field_str) { ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort=")); break; } if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + hist_err(tr, HIST_ERR_TOO_MANY_SORT_FIELDS, errpos("sort=")); ret = -EINVAL; break; } @@ -4793,11 +4804,12 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) field_name = strsep(&field_str, "."); if (!field_name || !*field_name) { ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort=")); break; } if (strcmp(field_name, "hitcount") == 0) { - descending = is_descending(field_str); + descending = is_descending(tr, field_str); if (descending < 0) { ret = descending; break; @@ -4819,7 +4831,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) if (strcmp(field_name, test_name) == 0) { sort_key->field_idx = idx; - descending = is_descending(field_str); + descending = is_descending(tr, field_str); if (descending < 0) { ret = descending; goto out; @@ -4830,6 +4842,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } if (j == hist_data->n_fields) { ret = -EINVAL; + hist_err(tr, HIST_ERR_INVALID_SORT_FIELD, errpos(field_name)); break; } } -- cgit v1.2.3 From d0a497066f92eee6e5750af6a0ca32866030931a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 28 Jun 2019 12:40:22 -0500 Subject: tracing: Add 'hist:' to hist trigger error log error string The 'hist:' prefix gets stripped from the command text during command processing, but should be added back when displaying the command during error processing. Not only because it's what should be displayed but also because not having it means the test cases fail because the caret is miscalculated by the length of the prefix string. Link: http://lkml.kernel.org/r/449df721f560042e22382f67574bcc5b4d830d3d.1561743018.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 23458ba9e5f5..c322826e0726 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -612,7 +612,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str) if (!str) return; - strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + strcpy(last_cmd, "hist:"); + strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); if (file) { call = file->event_call; -- cgit v1.2.3 From 76a598ec8c4fde58aab79b9f7c40c33d54eca67b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 29 Jan 2020 18:36:35 +0900 Subject: tracing/boot: Include required headers and sort it alphabetically Include some required (but currently indirectly included) headers and sort it alphabetically. Link: http://lkml.kernel.org/r/158029059514.12381.6597832266860248781.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 2f616cd926b0..5aad41961f03 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -6,9 +6,16 @@ #define pr_fmt(fmt) "trace_boot: " fmt +#include +#include #include #include -#include +#include +#include +#include +#include +#include +#include #include "trace.h" -- cgit v1.2.3 From 5c3469cb899abe998299aafb8f16f325d62d2d68 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 29 Jan 2020 18:36:44 +0900 Subject: tracing/boot: Move external function declarations to kernel/trace/trace.h Move external function declarations into kernel/trace/trace.h from trace_boot.c for tracing subsystem internal use. Link: http://lkml.kernel.org/r/158029060405.12381.11944554430359702545.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 17 +++++++++++++++++ kernel/trace/trace_boot.c | 15 --------------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bb64d89c321..b3075b637d14 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1157,6 +1157,11 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd); void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); + +extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); #else struct ftrace_func_command; @@ -1905,6 +1910,15 @@ void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +/* Used from boot time tracer */ +extern int trace_set_options(struct trace_array *tr, char *option); +extern int tracing_set_tracer(struct trace_array *tr, const char *buf); +extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id); +extern int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new); + + #define MAX_EVENT_NAME_LEN 64 extern int trace_run_command(const char *buf, int (*createfn)(int, char**)); @@ -1964,6 +1978,9 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); void trace_event_eval_update(struct trace_eval_map **map, int len); +/* Used from boot time tracer */ +extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); +extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 5aad41961f03..4d37bf5c3742 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -21,13 +21,6 @@ #define MAX_BUF_LEN 256 -extern int trace_set_options(struct trace_array *tr, char *option); -extern int tracing_set_tracer(struct trace_array *tr, const char *buf); -extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, - unsigned long size, int cpu_id); -extern int tracing_set_cpumask(struct trace_array *tr, - cpumask_var_t tracing_cpumask_new); - static void __init trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) { @@ -76,9 +69,6 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) } #ifdef CONFIG_EVENT_TRACING -extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); -extern int trigger_process_regex(struct trace_event_file *file, char *buff); - static void __init trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) { @@ -252,11 +242,6 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) #endif #ifdef CONFIG_DYNAMIC_FTRACE -extern bool ftrace_filter_param __initdata; -extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, - int len, int reset); -extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, - int len, int reset); static void __init trace_boot_set_ftrace_filter(struct trace_array *tr, struct xbc_node *node) { -- cgit v1.2.3 From d3e42bb0a329fadff98fcb927714d0a486840e3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 27 Jan 2020 09:51:45 -0800 Subject: bpf: Reuse log from btf_prase_vmlinux() in btf_struct_ops_init() Instead of using a locally defined "struct bpf_verifier_log log = {}", btf_struct_ops_init() should reuse the "log" from its calling function "btf_parse_vmlinux()". It should also resolve the frame-size too large compiler warning in some ARCH. Fixes: 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200127175145.1154438-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 5 ++--- kernel/bpf/btf.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ad1c9ea26b2..042f95534f86 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -96,12 +96,11 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = { static const struct btf_type *module_type; -void bpf_struct_ops_init(struct btf *btf) +void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) { s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; - struct bpf_verifier_log log = {}; const struct btf_type *t; char value_name[128]; const char *mname; @@ -172,7 +171,7 @@ void bpf_struct_ops_init(struct btf *btf) member->type, NULL); if (func_proto && - btf_distill_func_proto(&log, btf, + btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[j])) { pr_warn("Error in parsing func ptr %s in struct %s\n", diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b7c1660fb594..8c9d8f266bef 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3643,7 +3643,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; } - bpf_struct_ops_init(btf); + bpf_struct_ops_init(btf, log); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); -- cgit v1.2.3 From 8c8c5a4994a306c217fd061cbfc5903399fd4c1c Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 10 Jan 2020 18:19:33 +0100 Subject: dma-contiguous: CMA: give precedence to cmdline Although the device tree might contain a reserved-memory DT node dedicated as the default CMA pool, users might want to change CMA's parameters using the kernel command line for debugging purposes and whatnot. Honor this by bypassing the reserved memory CMA setup, which will ultimately end up freeing the memblock and allow the command line CMA configuration routine to run. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Phil Elwell Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; -- cgit v1.2.3 From 64ae572bc7d0060429e40e1c8d803ce5eb31a0d6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Aug 2019 10:12:08 -0400 Subject: tracing: Fix sched switch start/stop refcount racy updates Reading the sched_cmdline_ref and sched_tgid_ref initial state within tracing_start_sched_switch without holding the sched_register_mutex is racy against concurrent updates, which can lead to tracepoint probes being registered more than once (and thus trigger warnings within tracepoint.c). [ May be the fix for this bug ] Link: https://lore.kernel.org/r/000000000000ab6f84056c786b93@google.com Link: http://lkml.kernel.org/r/20190817141208.15226-1-mathieu.desnoyers@efficios.com Cc: stable@vger.kernel.org CC: Steven Rostedt (VMware) CC: Joel Fernandes (Google) CC: Peter Zijlstra CC: Thomas Gleixner CC: Paul E. McKenney Reported-by: syzbot+774fddf07b7ab29a1e55@syzkaller.appspotmail.com Fixes: d914ba37d7145 ("tracing: Add support for recording tgid of tasks") Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_switch.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e288168661e1..e304196d7c28 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -89,8 +89,10 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(int ops) { - bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); + bool sched_register; + mutex_lock(&sched_register_mutex); + sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: -- cgit v1.2.3 From e4075e8bdffd93a9b6d6e1d52fabedceeca5a91b Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 24 Jan 2020 10:02:56 +0300 Subject: ftrace: fpid_next() should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. Without patch: # dd bs=4 skip=1 if=/sys/kernel/tracing/set_ftrace_pid dd: /sys/kernel/tracing/set_ftrace_pid: cannot skip to specified offset id no pid 2+1 records in 2+1 records out 10 bytes copied, 0.000213285 s, 46.9 kB/s Notice the "id" followed by "no pid". With the patch: # dd bs=4 skip=1 if=/sys/kernel/tracing/set_ftrace_pid dd: /sys/kernel/tracing/set_ftrace_pid: cannot skip to specified offset id 0+1 records in 0+1 records out 3 bytes copied, 0.000202112 s, 14.8 kB/s Notice that it only prints "id" and not the "no pid" afterward. Link: http://lkml.kernel.org/r/4f87c6ad-f114-30bb-8506-c32274ce2992@virtuozzo.com https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdb1a9532420..0e9612c30995 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7026,9 +7026,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); - if (v == FTRACE_NO_PIDS) + if (v == FTRACE_NO_PIDS) { + (*pos)++; return NULL; - + } return trace_pid_next(pid_list, v, pos); } -- cgit v1.2.3 From 039958a5f7aad695d4d52683c7d48aa13fb18249 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 24 Jan 2020 10:03:01 +0300 Subject: tracing: eval_map_next() should always increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. Link: http://lkml.kernel.org/r/7ad85b22-1866-977c-db17-88ac438bc764@virtuozzo.com Signed-off-by: Vasily Averin [ This is not a bug fix, it just makes it "technically correct" which is why I applied it. NULL is only returned on an anomaly which triggers a WARN_ON ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a28b1b9bf42..8d144fd94aa8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5399,14 +5399,12 @@ static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ + (*pos)++; ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; ptr++; - - (*pos)++; - ptr = update_eval_map(ptr); return ptr; -- cgit v1.2.3 From 6722b23e7a2ace078344064a9735fb73e554e9ef Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 24 Jan 2020 10:03:06 +0300 Subject: trigger_next should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. Without patch: # dd bs=30 skip=1 if=/sys/kernel/tracing/events/sched/sched_switch/trigger dd: /sys/kernel/tracing/events/sched/sched_switch/trigger: cannot skip to specified offset n traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist # Available triggers: # traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist 6+1 records in 6+1 records out 206 bytes copied, 0.00027916 s, 738 kB/s Notice the printing of "# Available triggers:..." after the line. With the patch: # dd bs=30 skip=1 if=/sys/kernel/tracing/events/sched/sched_switch/trigger dd: /sys/kernel/tracing/events/sched/sched_switch/trigger: cannot skip to specified offset n traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist 2+1 records in 2+1 records out 88 bytes copied, 0.000526867 s, 167 kB/s It only prints the end of the file, and does not restart. Link: http://lkml.kernel.org/r/3c35ee24-dd3a-8119-9c19-552ed253388a@virtuozzo.com https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 60959c31791d..dd34a1b46a86 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -116,9 +116,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) { struct trace_event_file *event_file = event_file_data(m->private); - if (t == SHOW_AVAILABLE_TRIGGERS) + if (t == SHOW_AVAILABLE_TRIGGERS) { + (*pos)++; return NULL; - + } return seq_list_next(t, &event_file->triggers, pos); } -- cgit v1.2.3 From 89c95fcef1942415e0f20d8c82e6e36ff8eeca9c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:21 -0600 Subject: tracing: Add trace_array_find/_get() to find instance trace arrays Add a new trace_array_find() function that can be used to find a trace array given the instance name, and replace existing code that does the same thing with it. Also add trace_array_find_get() which does the same but returns the trace array after upping its refcount. Also make both available for use outside of trace.c. Link: http://lkml.kernel.org/r/cb68528c975eba95bee4561ac67dd1499423b2e5.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 43 +++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 2 ++ 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d144fd94aa8..183b031a3828 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8499,6 +8499,34 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } +/* Must have trace_types_lock held */ +struct trace_array *trace_array_find(const char *instance) +{ + struct trace_array *tr, *found = NULL; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, instance) == 0) { + found = tr; + break; + } + } + + return found; +} + +struct trace_array *trace_array_find_get(const char *instance) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + tr = trace_array_find(instance); + if (tr) + tr->ref++; + mutex_unlock(&trace_types_lock); + + return tr; +} + static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; @@ -8575,10 +8603,8 @@ static int instance_mkdir(const char *name) mutex_lock(&trace_types_lock); ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } + if (trace_array_find(name)) + goto out_unlock; tr = trace_array_create(name); @@ -8706,12 +8732,9 @@ static int instance_rmdir(const char *name) mutex_lock(&trace_types_lock); ret = -ENODEV; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - ret = __remove_instance(tr); - break; - } - } + tr = trace_array_find(name); + if (tr) + ret = __remove_instance(tr); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b3075b637d14..f5480a2aa334 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -358,6 +358,8 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); +extern struct trace_array *trace_array_find(const char *instance); +extern struct trace_array *trace_array_find_get(const char *instance); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); -- cgit v1.2.3 From e3e2a2cc9c96725457ad6f31712ea7681a55666e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:22 -0600 Subject: tracing: Add trace_get/put_event_file() Add a function to get an event file and prevent it from going away on module or instance removal. trace_get_event_file() will find an event file in a given instance (if instance is NULL, it assumes the top trace array) and return it, pinning the instance's trace array as well as the event's module, if applicable, so they won't go away while in use. trace_put_event_file() does the matching release. Link: http://lkml.kernel.org/r/bb31ac4bdda168d5ed3c4b5f5a4c8f633e8d9118.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi [ Moved trace_array_put() to end of trace_put_event_file() ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index dfb736a964d6..da62472b1297 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2536,6 +2536,91 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return file; } +/** + * trace_get_event_file - Find and return a trace event file + * @instance: The name of the trace instance containing the event + * @system: The name of the system containing the event + * @event: The name of the event + * + * Return a trace event file given the trace instance name, trace + * system, and trace event name. If the instance name is NULL, it + * refers to the top-level trace array. + * + * This function will look it up and return it if found, after calling + * trace_array_get() to prevent the instance from going away, and + * increment the event's module refcount to prevent it from being + * removed. + * + * To release the file, call trace_put_event_file(), which will call + * trace_array_put() and decrement the event's module refcount. + * + * Return: The trace event on success, ERR_PTR otherwise. + */ +struct trace_event_file *trace_get_event_file(const char *instance, + const char *system, + const char *event) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_file *file = NULL; + int ret = -EINVAL; + + if (instance) { + tr = trace_array_find_get(instance); + if (!tr) + return ERR_PTR(-ENOENT); + } else { + ret = trace_array_get(tr); + if (ret) + return ERR_PTR(ret); + } + + mutex_lock(&event_mutex); + + file = find_event_file(tr, system, event); + if (!file) { + trace_array_put(tr); + ret = -EINVAL; + goto out; + } + + /* Don't let event modules unload while in use */ + ret = try_module_get(file->event_call->mod); + if (!ret) { + trace_array_put(tr); + ret = -EBUSY; + goto out; + } + + ret = 0; + out: + mutex_unlock(&event_mutex); + + if (ret) + file = ERR_PTR(ret); + + return file; +} +EXPORT_SYMBOL_GPL(trace_get_event_file); + +/** + * trace_put_event_file - Release a file from trace_get_event_file() + * @file: The trace event file + * + * If a file was retrieved using trace_get_event_file(), this should + * be called when it's no longer needed. It will cancel the previous + * trace_array_get() called by that function, and decrement the + * event's module refcount. + */ +void trace_put_event_file(struct trace_event_file *file) +{ + mutex_lock(&event_mutex); + module_put(file->event_call->mod); + mutex_unlock(&event_mutex); + + trace_array_put(file->tr); +} +EXPORT_SYMBOL_GPL(trace_put_event_file); + #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ -- cgit v1.2.3 From f5f6b255a253e2c3132ca283e9090a6343bfb719 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:23 -0600 Subject: tracing: Add synth_event_delete() create_or_delete_synth_event() contains code to delete a synthetic event, which would be useful on its own - specifically, it would be useful to allow event-creating modules to call it separately. Separate out the delete code from that function and create an exported function named synth_event_delete(). Link: http://lkml.kernel.org/r/050db3b06df7f0a4b8a2922da602d1d879c7c1c2.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 57 +++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c322826e0726..21e316732700 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1360,29 +1360,54 @@ static int __create_synth_event(int argc, const char *name, const char **argv) goto out; } +static int destroy_synth_event(struct synth_event *se) +{ + int ret; + + if (se->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); + } + } + + return ret; +} + +/** + * synth_event_delete - Delete a synthetic event + * @event_name: The name of the new sythetic event + * + * Delete a synthetic event that was created with synth_event_create(). + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) + ret = destroy_synth_event(se); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + static int create_or_delete_synth_event(int argc, char **argv) { const char *name = argv[0]; - struct synth_event *event = NULL; int ret; /* trace_run_command() ensures argc != 0 */ if (name[0] == '!') { - mutex_lock(&event_mutex); - event = find_synth_event(name + 1); - if (event) { - if (event->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(event); - if (!ret) { - dyn_event_remove(&event->devent); - free_synth_event(event); - } - } - } else - ret = -ENOENT; - mutex_unlock(&event_mutex); + ret = synth_event_delete(name + 1); return ret; } -- cgit v1.2.3 From 86c5426baddae9ff192e3159b9c2e7c14e3964c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:24 -0600 Subject: tracing: Add dynamic event command creation interface Add an interface used to build up dynamic event creation commands, such as synthetic and kprobe events. Interfaces specific to those particular types of events and others can be built on top of this interface. Command creation is started by first using the dynevent_cmd_init() function to initialize the dynevent_cmd object. Following that, args are appended and optionally checked by the dynevent_arg_add() and dynevent_arg_pair_add() functions, which use objects representing arguments and pairs of arguments, initialized respectively by dynevent_arg_init() and dynevent_arg_pair_init(). Finally, once all args have been successfully added, the command is finalized and actually created using dynevent_create(). The code here for actually printing into the dyn_event->cmd buffer using snprintf() etc was adapted from v4 of Masami's 'tracing/boot: Add synthetic event support' patch. Link: http://lkml.kernel.org/r/1f65fa44390b6f238f6036777c3784ced1dcc6a0.1580323897.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 240 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_dynevent.h | 33 ++++++ 2 files changed, 273 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 89779eb84a07..6ffdbc4fda53 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -223,3 +223,243 @@ static __init int init_dynamic_event(void) return 0; } fs_initcall(init_dynamic_event); + +/** + * dynevent_arg_add - Add an arg to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @arg: The argument to append to the current cmd + * + * Append an argument to a dynevent_cmd. The argument string will be + * appended to the current cmd string, followed by a separator, if + * applicable. Before the argument is added, the check_arg() + * function, if defined, is called. + * + * The cmd string, separator, and check_arg() function should be set + * using the dynevent_arg_init() before any arguments are added using + * this function. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg) +{ + int ret = 0; + int delta; + char *q; + + if (arg->check_arg) { + ret = arg->check_arg(arg); + if (ret) + return ret; + } + + q = cmd->buf + (cmd->maxlen - cmd->remaining); + + delta = snprintf(q, cmd->remaining, " %s%c", arg->str, arg->separator); + if (delta >= cmd->remaining) { + pr_err("String is too long: %s\n", arg->str); + return -E2BIG; + } + cmd->remaining -= delta; + + return ret; +} + +/** + * dynevent_arg_pair_add - Add an arg pair to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @arg_pair: The argument pair to append to the current cmd + * + * Append an argument pair to a dynevent_cmd. An argument pair + * consists of a left-hand-side argument and a right-hand-side + * argument separated by an operator, which can be whitespace, all + * followed by a separator, if applicable. This can be used to add + * arguments of the form 'type variable_name;' or 'x+y'. + * + * The lhs argument string will be appended to the current cmd string, + * followed by an operator, if applicable, followd by the rhs string, + * followed finally by a separator, if applicable. Before anything is + * added, the check_arg() function, if defined, is called. + * + * The cmd strings, operator, separator, and check_arg() function + * should be set using the dynevent_arg_pair_init() before any arguments + * are added using this function. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair) +{ + int ret = 0; + int delta; + char *q; + + if (arg_pair->check_arg) { + ret = arg_pair->check_arg(arg_pair); + if (ret) + return ret; + } + + q = cmd->buf + (cmd->maxlen - cmd->remaining); + + delta = snprintf(q, cmd->remaining, " %s%c", arg_pair->lhs, + arg_pair->operator); + if (delta >= cmd->remaining) { + pr_err("field string is too long: %s\n", arg_pair->lhs); + return -E2BIG; + } + cmd->remaining -= delta; q += delta; + + delta = snprintf(q, cmd->remaining, "%s%c", arg_pair->rhs, + arg_pair->separator); + if (delta >= cmd->remaining) { + pr_err("field string is too long: %s\n", arg_pair->rhs); + return -E2BIG; + } + cmd->remaining -= delta; q += delta; + + return ret; +} + +/** + * dynevent_str_add - Add a string to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @str: The string to append to the current cmd + * + * Append a string to a dynevent_cmd. The string will be appended to + * the current cmd string as-is, with nothing prepended or appended. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_str_add(struct dynevent_cmd *cmd, const char *str) +{ + int ret = 0; + int delta; + char *q; + + q = cmd->buf + (cmd->maxlen - cmd->remaining); + + delta = snprintf(q, cmd->remaining, "%s", str); + if (delta >= cmd->remaining) { + pr_err("String is too long: %s\n", str); + return -E2BIG; + } + cmd->remaining -= delta; + + return ret; +} + +/** + * dynevent_cmd_init - Initialize a dynevent_cmd object + * @cmd: A pointer to the dynevent_cmd struct representing the cmd + * @buf: A pointer to the buffer to generate the command into + * @maxlen: The length of the buffer the command will be generated into + * @type: The type of the cmd, checked against further operations + * @run_command: The type-specific function that will actually run the command + * + * Initialize a dynevent_cmd. A dynevent_cmd is used to build up and + * run dynamic event creation commands, such as commands for creating + * synthetic and kprobe events. Before calling any of the functions + * used to build the command, a dynevent_cmd object should be + * instantiated and initialized using this function. + * + * The initialization sets things up by saving a pointer to the + * user-supplied buffer and its length via the @buf and @maxlen + * params, and by saving the cmd-specific @type and @run_command + * params which are used to check subsequent dynevent_cmd operations + * and actually run the command when complete. + */ +void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + enum dynevent_type type, + dynevent_create_fn_t run_command) +{ + memset(cmd, '\0', sizeof(*cmd)); + + cmd->buf = buf; + cmd->maxlen = maxlen; + cmd->remaining = cmd->maxlen; + cmd->type = type; + cmd->run_command = run_command; +} + +/** + * dynevent_arg_init - Initialize a dynevent_arg object + * @arg: A pointer to the dynevent_arg struct representing the arg + * @check_arg: An (optional) pointer to a function checking arg sanity + * @separator: An (optional) separator, appended after adding the arg + * + * Initialize a dynevent_arg object. A dynevent_arg represents an + * object used to append single arguments to the current command + * string. The @check_arg function, if present, will be used to check + * the sanity of the current arg string (which is directly set by the + * caller). After the arg string is successfully appended to the + * command string, the optional @separator is appended. If no + * separator was specified when initializing the arg, a space will be + * appended. + */ +void dynevent_arg_init(struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg, + char separator) +{ + memset(arg, '\0', sizeof(*arg)); + + if (!separator) + separator = ' '; + arg->separator = separator; + + arg->check_arg = check_arg; +} + +/** + * dynevent_arg_pair_init - Initialize a dynevent_arg_pair object + * @arg_pair: A pointer to the dynevent_arg_pair struct representing the arg + * @check_arg: An (optional) pointer to a function checking arg sanity + * @operator: An (optional) operator, appended after adding the first arg + * @separator: An (optional) separator, appended after adding the second arg + * + * Initialize a dynevent_arg_pair object. A dynevent_arg_pair + * represents an object used to append argument pairs such as 'type + * variable_name;' or 'x+y' to the current command string. An + * argument pair consists of a left-hand-side argument and a + * right-hand-side argument separated by an operator, which can be + * whitespace, all followed by a separator, if applicable. The + * @check_arg function, if present, will be used to check the sanity + * of the current arg strings (which is directly set by the caller). + * After the first arg string is successfully appended to the command + * string, the optional @operator is appended, followed by the second + * arg and and optional @separator. If no separator was specified + * when initializing the arg, a space will be appended. + */ +void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg, + char operator, char separator) +{ + memset(arg_pair, '\0', sizeof(*arg_pair)); + + if (!operator) + operator = ' '; + arg_pair->operator = operator; + + if (!separator) + separator = ' '; + arg_pair->separator = separator; + + arg_pair->check_arg = check_arg; +} + +/** + * dynevent_create - Create the dynamic event contained in dynevent_cmd + * @cmd: The dynevent_cmd object containing the dynamic event creation command + * + * Once a dynevent_cmd object has been successfully built up via the + * dynevent_cmd_init(), dynevent_arg_add() and dynevent_arg_pair_add() + * functions, this function runs the final command to actually create + * the event. + * + * Return: 0 if the event was successfully created, error otherwise. + */ +int dynevent_create(struct dynevent_cmd *cmd) +{ + return cmd->run_command(cmd); +} +EXPORT_SYMBOL_GPL(dynevent_create); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 46898138d2df..b593fc34c5b1 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -117,4 +117,37 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type); #define for_each_dyn_event_safe(pos, n) \ list_for_each_entry_safe(pos, n, &dyn_event_list, list) +extern void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + enum dynevent_type type, + dynevent_create_fn_t run_command); + +typedef int (*dynevent_check_arg_fn_t)(void *data); + +struct dynevent_arg { + const char *str; + char separator; /* e.g. ';', ',', or nothing */ + dynevent_check_arg_fn_t check_arg; +}; + +extern void dynevent_arg_init(struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg, + char separator); +extern int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg); + +struct dynevent_arg_pair { + const char *lhs; + const char *rhs; + char operator; /* e.g. '=' or nothing */ + char separator; /* e.g. ';', ',', or nothing */ + dynevent_check_arg_fn_t check_arg; +}; + +extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg, + char operator, char separator); +extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair); +extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str); + #endif -- cgit v1.2.3 From 35ca5207c2d111abb9e072f028945d5c12b20836 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:25 -0600 Subject: tracing: Add synthetic event command generation functions Add functions used to generate synthetic event commands, built on top of the dynevent_cmd interface. synth_event_gen_cmd_start() is used to create a synthetic event command using a variable arg list and synth_event_gen_cmd_array_start() does the same thing but using an array of field descriptors. synth_event_add_field(), synth_event_add_field_str() and synth_event_add_fields() can be used to add single fields one by one or as a group. Once all desired fields are added, synth_event_gen_cmd_end() is used to actually execute the command and create the event. synth_event_create() does everything, including creating the event, in a single call. Link: http://lkml.kernel.org/r/38fef702fad5ef208009f459552f34a94befd860.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 379 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 375 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21e316732700..5a910bb193e9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -379,7 +379,7 @@ struct hist_trigger_data { unsigned int n_save_var_str; }; -static int synth_event_create(int argc, const char **argv); +static int create_synth_event(int argc, const char **argv); static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); static bool synth_event_is_busy(struct dyn_event *ev); @@ -387,7 +387,7 @@ static bool synth_event_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations synth_event_ops = { - .create = synth_event_create, + .create = create_synth_event, .show = synth_event_show, .is_busy = synth_event_is_busy, .free = synth_event_release, @@ -412,6 +412,7 @@ struct synth_event { struct trace_event_class class; struct trace_event_call call; struct tracepoint *tp; + struct module *mod; }; static bool is_synth_event(struct dyn_event *ev) @@ -1292,6 +1293,273 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + + return size ? 0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, synth_event_check_arg_fn, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, NULL, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, NULL, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, NULL, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; @@ -1360,6 +1628,56 @@ static int __create_synth_event(int argc, const char *name, const char **argv) goto out; } +/** + * synth_event_create - Create a new synthetic event + * @name: The name of the new sythetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * @mod: The module creating the event, NULL if not created from a module + * + * Create a new synthetic event with the given name under the + * trace/events/synthetic/ directory. The event fields that will be + * defined for the event should be passed in as an array of struct + * synth_field_desc, and the number elements in the array passed in as + * n_fields. Field ordering will retain the ordering given in the + * fields array. + * + * If the new synthetic event is being created from a module, the mod + * param must be non-NULL. This will ensure that the trace buffer + * won't contain unreadable events. + * + * The new synth event should be deleted using synth_event_delete() + * function. The new synthetic event can be generated from modules or + * other kernel code using trace_synth_event() and related functions. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_create(const char *name, struct synth_field_desc *fields, + unsigned int n_fields, struct module *mod) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + ret = synth_event_gen_cmd_array_start(&cmd, name, mod, + fields, n_fields); + if (ret) + goto out; + + ret = synth_event_gen_cmd_end(&cmd); + out: + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_create); + static int destroy_synth_event(struct synth_event *se) { int ret; @@ -1388,14 +1706,33 @@ static int destroy_synth_event(struct synth_event *se) int synth_event_delete(const char *event_name) { struct synth_event *se = NULL; + struct module *mod = NULL; int ret = -ENOENT; mutex_lock(&event_mutex); se = find_synth_event(event_name); - if (se) + if (se) { + mod = se->mod; ret = destroy_synth_event(se); + } mutex_unlock(&event_mutex); + if (mod) { + mutex_lock(&trace_types_lock); + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + mutex_unlock(&trace_types_lock); + } + return ret; } EXPORT_SYMBOL_GPL(synth_event_delete); @@ -1420,7 +1757,41 @@ int synth_event_run_command(const char *command) return trace_run_command(command, create_or_delete_synth_event); } -static int synth_event_create(int argc, const char **argv) +static int synth_event_run_cmd(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = trace_run_command(cmd->buf, create_or_delete_synth_event); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. + */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_cmd); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static int create_synth_event(int argc, const char **argv) { const char *name = argv[0]; int len; -- cgit v1.2.3 From 8dcc53ad956d2caf4c5c2dda196e6801b71a3154 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:27 -0600 Subject: tracing: Add synth_event_trace() and related functions Add an exported function named synth_event_trace(), allowing modules or other kernel code to trace synthetic events. Also added are several functions that allow the same functionality to be broken out in a piecewise fashion, which are useful in situations where tracing an event from a full array of values would be cumbersome. Those functions are synth_event_trace_start/end() and synth_event_add_(next)_val(). Link: http://lkml.kernel.org/r/7a84de5f1854acf4144b57efe835ca645afa764f.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 463 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5a910bb193e9..4d56a4f0310d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -398,6 +398,7 @@ struct synth_field { char *type; char *name; size_t size; + unsigned int offset; bool is_signed; bool is_string; }; @@ -668,6 +669,8 @@ static int synth_event_define_fields(struct trace_event_call *call) if (ret) break; + event->fields[i]->offset = n_u64; + if (event->fields[i]->is_string) { offset += STR_VAR_LEN_MAX; n_u64 += STR_VAR_LEN_MAX / sizeof(u64); @@ -1791,6 +1794,466 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) } EXPORT_SYMBOL_GPL(synth_event_cmd_init); +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) +{ + struct trace_event_buffer fbuffer; + struct synth_trace_event *entry; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64; + int fields_size = 0; + va_list args; + int ret = 0; + + /* + * Normal event generation doesn't get called at all unless + * the ENABLED bit is set (which attaches the probe thus + * allowing this code to be called, etc). Because this is + * called directly by the user, we don't have that but we + * still need to honor not logging when disabled. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + return 0; + + event = file->event_call->data; + + if (n_vals != event->n_fields) + return -EINVAL; + + if (trace_trigger_soft_disabled(file)) + return -EINVAL; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, file, + sizeof(*entry) + fields_size); + if (!entry) { + ret = -EINVAL; + goto out; + } + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + entry->fields[n_u64] = val; + n_u64++; + } + } + va_end(args); + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + struct trace_event_buffer fbuffer; + struct synth_trace_event *entry; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64; + int fields_size = 0; + int ret = 0; + + /* + * Normal event generation doesn't get called at all unless + * the ENABLED bit is set (which attaches the probe thus + * allowing this code to be called, etc). Because this is + * called directly by the user, we don't have that but we + * still need to honor not logging when disabled. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + return 0; + + event = file->event_call->data; + + if (n_vals != event->n_fields) + return -EINVAL; + + if (trace_trigger_soft_disabled(file)) + return -EINVAL; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, file, + sizeof(*entry) + fields_size); + if (!entry) { + ret = -EINVAL; + goto out; + } + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + entry->fields[n_u64] = vals[i]; + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. + * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + struct synth_trace_event *entry; + int fields_size = 0; + int ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + trace_state->enabled = false; + goto out; + } + + trace_state->enabled = true; + + trace_state->event = file->event_call->data; + + if (trace_trigger_soft_disabled(file)) { + ret = -EINVAL; + goto out; + } + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, + sizeof(*entry) + fields_size); + if (!entry) { + ret = -EINVAL; + goto out; + } + + trace_state->entry = entry; +out: + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int save_synth_val(struct synth_field *field, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_trace_event *entry = trace_state->entry; + + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (!str_val) + return -EINVAL; + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else + entry->fields[field->offset] = val; + + return 0; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_next_val(u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field; + struct synth_event *event; + int ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + + if (!trace_state->enabled) + goto out; + + event = trace_state->event; + + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + + field = event->fields[trace_state->cur_field++]; + ret = save_synth_val(field, val, trace_state); + out: + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_next_val); + +static struct synth_field *find_synth_field(struct synth_event *event, + const char *field_name) +{ + struct synth_field *field = NULL; + unsigned int i; + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + return field; + } + + return NULL; +} + +/** + * synth_event_add_val - Add a named field's value to an open synth trace + * @field_name: The name of the synthetic event field value to set + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the named field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function looks up the field name, and if found, sets the field + * to the specified value. This lookup makes this function more + * expensive than synth_event_add_next_val(), so use that or the + * none-piecewise synth_event_trace() instead if efficiency is more + * important. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_trace_event *entry; + struct synth_event *event; + struct synth_field *field; + int ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + + if (!trace_state->enabled) + goto out; + + event = trace_state->event; + entry = trace_state->entry; + + field = find_synth_field(event, field_name); + if (!field) { + ret = -EINVAL; + goto out; + } + + ret = save_synth_val(field, val, trace_state); + out: + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_val); + +/** + * synth_event_trace_end - End piecewise synthetic event trace + * @trace_state: A pointer to object tracking the piecewise trace state + * + * End the trace of a synthetic event opened by + * synth_event_trace__start(). + * + * This function 'closes' an event trace, which basically means that + * it commits the reserved event and cleans up other loose ends. + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state opened with + * synth_event_trace_start(). + * + * Note that this function must be called after all values have been + * added for each event trace, regardless of whether adding all field + * values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + static int create_synth_event(int argc, const char **argv) { const char *name = argv[0]; -- cgit v1.2.3 From 9fe41efaca08416657efa8731c0d47ccb6a3f3eb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:28 -0600 Subject: tracing: Add synth event generation test module Add a test module that checks the basic functionality of the in-kernel synthetic event generation API by generating and tracing synthetic events from a module. Link: http://lkml.kernel.org/r/fcb4dd9eb9eefb70ab20538d3529d51642389664.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 13 + kernel/trace/Makefile | 1 + kernel/trace/synth_event_gen_test.c | 523 ++++++++++++++++++++++++++++++++++++ 3 files changed, 537 insertions(+) create mode 100644 kernel/trace/synth_event_gen_test.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 75326d8ab1af..4f2041166a2f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,6 +774,19 @@ config PREEMPTIRQ_DELAY_TEST If unsure, say N +config SYNTH_EVENT_GEN_TEST + tristate "Test module for in-kernel synthetic event generation" + depends on HIST_TRIGGERS + help + This option creates a test module to check the base + functionality of in-kernel synthetic event definition and + generation. + + To test, insert the module, and then check the trace buffer + for the generated sample events. + + If unsure, say N. + config TRACE_EVAL_MAP_FILE bool "Show eval mappings for trace events" depends on TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 395e2db9c742..32012f50fb79 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o +obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c new file mode 100644 index 000000000000..4aefe003cb7c --- /dev/null +++ b/kernel/trace/synth_event_gen_test.c @@ -0,0 +1,523 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test module for in-kernel sythetic event creation and generation. + * + * Copyright (C) 2019 Tom Zanussi + */ + +#include +#include + +/* + * This module is a simple test of basic functionality for in-kernel + * synthetic event creation and generation, the first and second tests + * using synth_event_gen_cmd_start() and synth_event_add_field(), the + * third uses synth_event_create() to do it all at once with a static + * field array. + * + * Following that are a few examples using the created events to test + * various ways of tracing a synthetic event. + * + * To test, select CONFIG_SYNTH_EVENT_GEN_TEST and build the module. + * Then: + * + * # insmod kernel/trace/synth_event_gen_test.ko + * # cat /sys/kernel/debug/tracing/trace + * + * You should see several events in the trace buffer - + * "create_synth_test", "empty_synth_test", and several instances of + * "gen_synth_test". + * + * To remove the events, remove the module: + * + * # rmmod synth_event_gen_test + * + */ + +static struct trace_event_file *create_synth_test; +static struct trace_event_file *empty_synth_test; +static struct trace_event_file *gen_synth_test; + +/* + * Test to make sure we can create a synthetic event, then add more + * fields. + */ +static int __init test_gen_synth_cmd(void) +{ + struct dynevent_cmd cmd; + u64 vals[7]; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Create the empty gen_synth_test synthetic event with the + * first 4 fields. + */ + ret = synth_event_gen_cmd_start(&cmd, "gen_synth_test", THIS_MODULE, + "pid_t", "next_pid_field", + "char[16]", "next_comm_field", + "u64", "ts_ns", + "u64", "ts_ms"); + if (ret) + goto free; + + /* Use synth_event_add_field to add the rest of the fields */ + + ret = synth_event_add_field(&cmd, "unsigned int", "cpu"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[64]", "my_string_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "int", "my_int_field"); + if (ret) + goto free; + + ret = synth_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the gen_synth_test event file. We need to prevent + * the instance and event from disappearing from underneath + * us, which trace_get_event_file() does (though in this case + * we're using the top-level instance which never goes away). + */ + gen_synth_test = trace_get_event_file(NULL, "synthetic", + "gen_synth_test"); + if (IS_ERR(gen_synth_test)) { + ret = PTR_ERR(gen_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", "gen_synth_test", true); + if (ret) { + trace_put_event_file(gen_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = 1000; /* ts_ms */ + vals[4] = smp_processor_id(); /* cpu */ + vals[5] = (u64)"thneed"; /* my_string_field */ + vals[6] = 598; /* my_int_field */ + + /* Now generate a gen_synth_test event */ + ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals)); + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + synth_event_delete("gen_synth_test"); + free: + kfree(buf); + + goto out; +} + +/* + * Test to make sure we can create an initially empty synthetic event, + * then add all the fields. + */ +static int __init test_empty_synth_event(void) +{ + struct dynevent_cmd cmd; + u64 vals[7]; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Create the empty_synth_test synthetic event with no fields. + */ + ret = synth_event_gen_cmd_start(&cmd, "empty_synth_test", THIS_MODULE); + if (ret) + goto free; + + /* Use synth_event_add_field to add all of the fields */ + + ret = synth_event_add_field(&cmd, "pid_t", "next_pid_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[16]", "next_comm_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "u64", "ts_ns"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "u64", "ts_ms"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "unsigned int", "cpu"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[64]", "my_string_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "int", "my_int_field"); + if (ret) + goto free; + + /* All fields have been added, close and register the synth event */ + + ret = synth_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the empty_synth_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). + */ + empty_synth_test = trace_get_event_file(NULL, "synthetic", + "empty_synth_test"); + if (IS_ERR(empty_synth_test)) { + ret = PTR_ERR(empty_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", "empty_synth_test", true); + if (ret) { + trace_put_event_file(empty_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = 1000; /* ts_ms */ + vals[4] = smp_processor_id(); /* cpu */ + vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[6] = 399; /* my_int_field */ + + /* Now trace an empty_synth_test event */ + ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals)); + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + synth_event_delete("empty_synth_test"); + free: + kfree(buf); + + goto out; +} + +static struct synth_field_desc create_synth_test_fields[] = { + { .type = "pid_t", .name = "next_pid_field" }, + { .type = "char[16]", .name = "next_comm_field" }, + { .type = "u64", .name = "ts_ns" }, + { .type = "u64", .name = "ts_ms" }, + { .type = "unsigned int", .name = "cpu" }, + { .type = "char[64]", .name = "my_string_field" }, + { .type = "int", .name = "my_int_field" }, +}; + +/* + * Test synthetic event creation all at once from array of field + * descriptors. + */ +static int __init test_create_synth_event(void) +{ + u64 vals[7]; + int ret; + + /* Create the create_synth_test event with the fields above */ + ret = synth_event_create("create_synth_test", + create_synth_test_fields, + ARRAY_SIZE(create_synth_test_fields), + THIS_MODULE); + if (ret) + goto out; + + /* + * Now get the create_synth_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). + */ + create_synth_test = trace_get_event_file(NULL, "synthetic", + "create_synth_test"); + if (IS_ERR(create_synth_test)) { + ret = PTR_ERR(create_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(create_synth_test->tr, + "synthetic", "create_synth_test", true); + if (ret) { + trace_put_event_file(create_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = 1000; /* ts_ms */ + vals[4] = smp_processor_id(); /* cpu */ + vals[5] = (u64)"thneed"; /* my_string_field */ + vals[6] = 398; /* my_int_field */ + + /* Now generate a create_synth_test event */ + ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals)); + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + ret = synth_event_delete("create_synth_test"); + + goto out; +} + +/* + * Test tracing a synthetic event by reserving trace buffer space, + * then filling in fields one after another. + */ +static int __init test_add_next_synth_val(void) +{ + struct synth_event_trace_state trace_state; + int ret; + + /* Start by reserving space in the trace buffer */ + ret = synth_event_trace_start(gen_synth_test, &trace_state); + if (ret) + return ret; + + /* Write some bogus values into the trace buffer, one after another */ + + /* next_pid_field */ + ret = synth_event_add_next_val(777, &trace_state); + if (ret) + goto out; + + /* next_comm_field */ + ret = synth_event_add_next_val((u64)"slinky", &trace_state); + if (ret) + goto out; + + /* ts_ns */ + ret = synth_event_add_next_val(1000000, &trace_state); + if (ret) + goto out; + + /* ts_ms */ + ret = synth_event_add_next_val(1000, &trace_state); + if (ret) + goto out; + + /* cpu */ + ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + if (ret) + goto out; + + /* my_string_field */ + ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + if (ret) + goto out; + + /* my_int_field */ + ret = synth_event_add_next_val(395, &trace_state); + out: + /* Finally, commit the event */ + ret = synth_event_trace_end(&trace_state); + + return ret; +} + +/* + * Test tracing a synthetic event by reserving trace buffer space, + * then filling in fields using field names, which can be done in any + * order. + */ +static int __init test_add_synth_val(void) +{ + struct synth_event_trace_state trace_state; + int ret; + + /* Start by reserving space in the trace buffer */ + ret = synth_event_trace_start(gen_synth_test, &trace_state); + if (ret) + return ret; + + /* Write some bogus values into the trace buffer, using field names */ + + ret = synth_event_add_val("ts_ns", 1000000, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("ts_ms", 1000, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("next_pid_field", 777, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("my_int_field", 3999, &trace_state); + out: + /* Finally, commit the event */ + ret = synth_event_trace_end(&trace_state); + + return ret; +} + +/* + * Test tracing a synthetic event all at once from array of values. + */ +static int __init test_trace_synth_event(void) +{ + int ret; + + /* Trace some bogus values just for testing */ + ret = synth_event_trace(create_synth_test, 7, /* number of values */ + 444, /* next_pid_field */ + (u64)"clackers", /* next_comm_field */ + 1000000, /* ts_ns */ + 1000, /* ts_ms */ + smp_processor_id(), /* cpu */ + (u64)"Thneed", /* my_string_field */ + 999); /* my_int_field */ + return ret; +} + +static int __init synth_event_gen_test_init(void) +{ + int ret; + + ret = test_gen_synth_cmd(); + if (ret) + return ret; + + ret = test_empty_synth_event(); + if (ret) { + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + trace_put_event_file(gen_synth_test); + WARN_ON(synth_event_delete("gen_synth_test")); + goto out; + } + + ret = test_create_synth_event(); + if (ret) { + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + trace_put_event_file(gen_synth_test); + WARN_ON(synth_event_delete("gen_synth_test")); + + WARN_ON(trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false)); + trace_put_event_file(empty_synth_test); + WARN_ON(synth_event_delete("empty_synth_test")); + goto out; + } + + ret = test_add_next_synth_val(); + WARN_ON(ret); + + ret = test_add_synth_val(); + WARN_ON(ret); + + ret = test_trace_synth_event(); + WARN_ON(ret); + out: + return ret; +} + +static void __exit synth_event_gen_test_exit(void) +{ + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("gen_synth_test")); + + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(empty_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("empty_synth_test")); + + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(create_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("create_synth_test")); +} + +module_init(synth_event_gen_test_init) +module_exit(synth_event_gen_test_exit) + +MODULE_AUTHOR("Tom Zanussi"); +MODULE_DESCRIPTION("synthetic event generation test"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 2a588dd1d5d649a183a2ff6fa1b80e870cf821d8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:29 -0600 Subject: tracing: Add kprobe event command generation functions Add functions used to generate kprobe event commands, built on top of the dynevent_cmd interface. kprobe_event_gen_cmd_start() is used to create a kprobe event command using a variable arg list, and kretprobe_event_gen_cmd_start() does the same for kretprobe event commands. kprobe_event_add_fields() can be used to add single fields one by one or as a group. Once all desired fields are added, kprobe_event_gen_cmd_end() or kretprobe_event_gen_cmd_end() respectively are used to actually execute the command and create the event. Link: http://lkml.kernel.org/r/95cc4696502bb6017f9126f306a45ad19b4cc14f.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 161 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bf20cd7f2666..f43548b466d0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -906,6 +906,167 @@ int trace_kprobe_run_command(const char *command) return trace_run_command(command, create_or_delete_trace_kprobe); } +static int trace_kprobe_run_cmd(struct dynevent_cmd *cmd) +{ + return trace_run_command(cmd->buf, create_or_delete_trace_kprobe); +} + +/** + * kprobe_event_cmd_init - Initialize a kprobe event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other kprobe_event functions. + */ +void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_KPROBE, + trace_kprobe_run_cmd); +} +EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); + +/** + * __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the kprobe event + * @loc: The location of the kprobe event + * @kretprobe: Is this a return probe? + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically + * adds a NULL to the end of the arg list. If this function is used + * directly, make sure the last arg in the variable arg list is NULL. + * + * Generate a kprobe event command to be executed by + * kprobe_event_gen_cmd_end(). This function can be used to generate the + * complete command or only the first part of it; in the latter case, + * kprobe_event_add_fields() can be used to add more fields following this. + * + * Return: 0 if successful, error otherwise. + */ +int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, + const char *name, const char *loc, ...) +{ + char buf[MAX_EVENT_NAME_LEN]; + struct dynevent_arg arg; + va_list args; + int ret; + + if (cmd->type != DYNEVENT_TYPE_KPROBE) + return -EINVAL; + + if (kretprobe) + snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "p:kprobes/%s", name); + + ret = dynevent_str_add(cmd, buf); + if (ret) + return ret; + + dynevent_arg_init(&arg, NULL, 0); + arg.str = loc; + ret = dynevent_arg_add(cmd, &arg); + if (ret) + return ret; + + va_start(args, loc); + for (;;) { + const char *field; + + field = va_arg(args, const char *); + if (!field) + break; + + if (++cmd->n_fields > MAX_TRACE_ARGS) { + ret = -EINVAL; + break; + } + + arg.str = field; + ret = dynevent_arg_add(cmd, &arg); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start); + +/** + * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the kprobe_event_add_fields() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Add probe fields to an existing kprobe command using a variable + * list of args. Fields are added in the same order they're listed. + * + * Return: 0 if successful, error otherwise. + */ +int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + if (cmd->type != DYNEVENT_TYPE_KPROBE) + return -EINVAL; + + dynevent_arg_init(&arg, NULL, 0); + + va_start(args, cmd); + for (;;) { + const char *field; + + field = va_arg(args, const char *); + if (!field) + break; + + if (++cmd->n_fields > MAX_TRACE_ARGS) { + ret = -EINVAL; + break; + } + + arg.str = field; + ret = dynevent_arg_add(cmd, &arg); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__kprobe_event_add_fields); + +/** + * kprobe_event_delete - Delete a kprobe event + * @name: The name of the kprobe event to delete + * + * Delete a kprobe event with the give @name from kernel code rather + * than directly from the command line. + * + * Return: 0 if successful, error otherwise. + */ +int kprobe_event_delete(const char *name) +{ + char buf[MAX_EVENT_NAME_LEN]; + + snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name); + + return trace_run_command(buf, create_or_delete_trace_kprobe); +} +EXPORT_SYMBOL_GPL(kprobe_event_delete); + static int trace_kprobe_release(struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); -- cgit v1.2.3 From 29a15481054681fa2d450b60a6feea8e6ca6f511 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:30 -0600 Subject: tracing: Change trace_boot to use kprobe_event interface Have trace_boot_add_kprobe_event() use the kprobe_event interface. Also, rename kprobe_event_run_cmd() to kprobe_event_run_command() now that trace_boot's version is gone. Link: http://lkml.kernel.org/r/af5429d11291ab1e9a85a0ff944af3b2bcf193c7.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 35 +++++++++++++++-------------------- kernel/trace/trace_kprobe.c | 9 ++------- 2 files changed, 17 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 4d37bf5c3742..2298a70cdda6 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -88,37 +88,32 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) } #ifdef CONFIG_KPROBE_EVENTS -extern int trace_kprobe_run_command(const char *command); - static int __init trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) { + struct dynevent_cmd cmd; struct xbc_node *anode; char buf[MAX_BUF_LEN]; const char *val; - char *p; - int len; + int ret; - len = snprintf(buf, ARRAY_SIZE(buf) - 1, "p:kprobes/%s ", event); - if (len >= ARRAY_SIZE(buf)) { - pr_err("Event name is too long: %s\n", event); - return -E2BIG; - } - p = buf + len; - len = ARRAY_SIZE(buf) - len; + kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); + + ret = kprobe_event_gen_cmd_start(&cmd, event, NULL); + if (ret) + return ret; xbc_node_for_each_array_value(node, "probes", anode, val) { - if (strlcpy(p, val, len) >= len) { - pr_err("Probe definition is too long: %s\n", val); - return -E2BIG; - } - if (trace_kprobe_run_command(buf) < 0) { - pr_err("Failed to add probe: %s\n", buf); - return -EINVAL; - } + ret = kprobe_event_add_field(&cmd, val); + if (ret) + return ret; } - return 0; + ret = kprobe_event_gen_cmd_end(&cmd); + if (ret) + pr_err("Failed to add probe: %s\n", buf); + + return ret; } #else static inline int __init diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f43548b466d0..307abb724a71 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -901,12 +901,7 @@ static int create_or_delete_trace_kprobe(int argc, char **argv) return ret == -ECANCELED ? -EINVAL : ret; } -int trace_kprobe_run_command(const char *command) -{ - return trace_run_command(command, create_or_delete_trace_kprobe); -} - -static int trace_kprobe_run_cmd(struct dynevent_cmd *cmd) +static int trace_kprobe_run_command(struct dynevent_cmd *cmd) { return trace_run_command(cmd->buf, create_or_delete_trace_kprobe); } @@ -923,7 +918,7 @@ static int trace_kprobe_run_cmd(struct dynevent_cmd *cmd) void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) { dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_KPROBE, - trace_kprobe_run_cmd); + trace_kprobe_run_command); } EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); -- cgit v1.2.3 From 64836248dda20c8e7427b493f7e06d9bf8f58850 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:31 -0600 Subject: tracing: Add kprobe event command generation test module Add a test module that checks the basic functionality of the in-kernel kprobe event command generation API by creating kprobe events from a module. Link: http://lkml.kernel.org/r/97e502b204f9dba948e3fa3a4315448298218787.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 12 ++ kernel/trace/Makefile | 1 + kernel/trace/kprobe_event_gen_test.c | 225 +++++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 kernel/trace/kprobe_event_gen_test.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4f2041166a2f..4484e783f68d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -787,6 +787,18 @@ config SYNTH_EVENT_GEN_TEST If unsure, say N. +config KPROBE_EVENT_GEN_TEST + tristate "Test module for in-kernel kprobe event generation" + depends on KPROBE_EVENTS + help + This option creates a test module to check the base + functionality of in-kernel kprobe event definition. + + To test, insert the module, and then check the trace buffer + for the generated kprobe events. + + If unsure, say N. + config TRACE_EVAL_MAP_FILE bool "Show eval mappings for trace events" depends on TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 32012f50fb79..f9dcd19165fa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o +obj-$(CONFIG_KPROBE_EVENT_GEN_TEST) += kprobe_event_gen_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c new file mode 100644 index 000000000000..18b0f1cbb947 --- /dev/null +++ b/kernel/trace/kprobe_event_gen_test.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test module for in-kernel kprobe event creation and generation. + * + * Copyright (C) 2019 Tom Zanussi + */ + +#include +#include + +/* + * This module is a simple test of basic functionality for in-kernel + * kprobe/kretprobe event creation. The first test uses + * kprobe_event_gen_cmd_start(), kprobe_event_add_fields() and + * kprobe_event_gen_cmd_end() to create a kprobe event, which is then + * enabled in order to generate trace output. The second creates a + * kretprobe event using kretprobe_event_gen_cmd_start() and + * kretprobe_event_gen_cmd_end(), and is also then enabled. + * + * To test, select CONFIG_KPROBE_EVENT_GEN_TEST and build the module. + * Then: + * + * # insmod kernel/trace/kprobe_event_gen_test.ko + * # cat /sys/kernel/debug/tracing/trace + * + * You should see many instances of the "gen_kprobe_test" and + * "gen_kretprobe_test" events in the trace buffer. + * + * To remove the events, remove the module: + * + * # rmmod kprobe_event_gen_test + * + */ + +static struct trace_event_file *gen_kprobe_test; +static struct trace_event_file *gen_kretprobe_test; + +/* + * Test to make sure we can create a kprobe event, then add more + * fields. + */ +static int __init test_gen_kprobe_cmd(void) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Define the gen_kprobe_test event with the first 2 kprobe + * fields. + */ + ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", + "do_sys_open", + "dfd=%ax", "filename=%dx"); + if (ret) + goto free; + + /* Use kprobe_event_add_fields to add the rest of the fields */ + + ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)"); + if (ret) + goto free; + + /* + * This actually creates the event. + */ + ret = kprobe_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the gen_kprobe_test event file. We need to prevent + * the instance and event from disappearing from underneath + * us, which trace_get_event_file() does (though in this case + * we're using the top-level instance which never goes away). + */ + gen_kprobe_test = trace_get_event_file(NULL, "kprobes", + "gen_kprobe_test"); + if (IS_ERR(gen_kprobe_test)) { + ret = PTR_ERR(gen_kprobe_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", "gen_kprobe_test", true); + if (ret) { + trace_put_event_file(gen_kprobe_test); + goto delete; + } + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + ret = kprobe_event_delete("gen_kprobe_test"); + free: + kfree(buf); + + goto out; +} + +/* + * Test to make sure we can create a kretprobe event. + */ +static int __init test_gen_kretprobe_cmd(void) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Define the kretprobe event. + */ + ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", + "do_sys_open", + "$retval"); + if (ret) + goto free; + + /* + * This actually creates the event. + */ + ret = kretprobe_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the gen_kretprobe_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). + */ + gen_kretprobe_test = trace_get_event_file(NULL, "kprobes", + "gen_kretprobe_test"); + if (IS_ERR(gen_kretprobe_test)) { + ret = PTR_ERR(gen_kretprobe_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", "gen_kretprobe_test", true); + if (ret) { + trace_put_event_file(gen_kretprobe_test); + goto delete; + } + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + ret = kprobe_event_delete("gen_kretprobe_test"); + free: + kfree(buf); + + goto out; +} + +static int __init kprobe_event_gen_test_init(void) +{ + int ret; + + ret = test_gen_kprobe_cmd(); + if (ret) + return ret; + + ret = test_gen_kretprobe_cmd(); + if (ret) { + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + trace_put_event_file(gen_kretprobe_test); + WARN_ON(kprobe_event_delete("gen_kretprobe_test")); + } + + return ret; +} + +static void __exit kprobe_event_gen_test_exit(void) +{ + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", + "gen_kprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kprobe_test); + + /* Now unregister and free the event */ + WARN_ON(kprobe_event_delete("gen_kprobe_test")); + + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kretprobe_test); + + /* Now unregister and free the event */ + WARN_ON(kprobe_event_delete("gen_kretprobe_test")); +} + +module_init(kprobe_event_gen_test_init) +module_exit(kprobe_event_gen_test_exit) + +MODULE_AUTHOR("Tom Zanussi"); +MODULE_DESCRIPTION("kprobe event generation test"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 61778cd70c1dcef082e5ee6781c7c0c4d7d5b576 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Jan 2020 16:19:10 -0500 Subject: tracing: Move all function tracing configs together The features that depend on the function tracer were spread out through the tracing menu, pull them together as it is easier to manage. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 142 +++++++++++++++++++++++++-------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4484e783f68d..32fcbc00753b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -172,6 +172,77 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config DYNAMIC_FTRACE + bool "enable/disable function tracing dynamically" + depends on FUNCTION_TRACER + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace + + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. + +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stat directory; this file shows the list of functions that + have been hit and their counters. + + If in doubt, say N. + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS + help + This special tracer records the maximum stack footprint of the + kernel and displays it in /sys/kernel/debug/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled + + Say N if unsure. + config TRACE_PREEMPT_TOGGLE bool help @@ -410,30 +481,6 @@ config BRANCH_TRACER Say N if unsure. -config STACK_TRACER - bool "Trace max stack" - depends on HAVE_FUNCTION_TRACER - select FUNCTION_TRACER - select STACKTRACE - select KALLSYMS - help - This special tracer records the maximum stack footprint of the - kernel and displays it in /sys/kernel/debug/tracing/stack_trace. - - This tracer works by hooking into every function call that the - kernel executes, and keeping a maximum stack depth value and - stack-trace saved. If this is configured with DYNAMIC_FTRACE - then it will not have any overhead while the stack tracer - is disabled. - - To enable the stack tracer on bootup, pass in 'stacktrace' - on the kernel command line. - - The stack tracer can also be enabled or disabled via the - sysctl kernel.stack_tracer_enabled - - Say N if unsure. - config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS @@ -531,53 +578,6 @@ config DYNAMIC_EVENTS config PROBE_EVENTS def_bool n -config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" - depends on FUNCTION_TRACER - depends on HAVE_DYNAMIC_FTRACE - default y - help - This option will modify all the calls to function tracing - dynamically (will patch them out of the binary image and - replace them with a No-Op instruction) on boot up. During - compile time, a table is made of all the locations that ftrace - can function trace, and this table is linked into the kernel - image. When this is enabled, functions can be individually - enabled, and the functions not enabled will not affect - performance of the system. - - See the files in /sys/kernel/debug/tracing: - available_filter_functions - set_ftrace_filter - set_ftrace_notrace - - This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but - otherwise has native performance as long as no tracing is active. - -config DYNAMIC_FTRACE_WITH_REGS - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_DYNAMIC_FTRACE_WITH_REGS - -config DYNAMIC_FTRACE_WITH_DIRECT_CALLS - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - -config FUNCTION_PROFILER - bool "Kernel function profiler" - depends on FUNCTION_TRACER - default n - help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A "functions" file is created in - the trace_stat directory; this file shows the list of functions that - have been hit and their counters. - - If in doubt, say N. - config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS -- cgit v1.2.3 From a48fc4f5f1d24c70906359731d7080f9ad78a1cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Jan 2020 16:23:04 -0500 Subject: tracing: Move tracing test module configs together The MMIO test module was by itself, move it to the other test modules. Also, add the text "Test module" to PREEMPTIRQ_DELAY_TEST as that create a test module as well. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 32fcbc00753b..47d0149347a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -680,16 +680,6 @@ config TRACE_EVENT_INJECT If unsure, say N. -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. - config TRACEPOINT_BENCHMARK bool "Add tracepoint that benchmarks tracepoints" help @@ -759,8 +749,18 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + config PREEMPTIRQ_DELAY_TEST - tristate "Preempt / IRQ disable delay thread to test latency tracers" + tristate "Test module to create a preempt / IRQ disable delay thread to test latency tracers" depends on m help Select this option to build a test module that can help test latency -- cgit v1.2.3 From 21b3ce3063be8eb7150a717d7c5286fb56ba8cea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Jan 2020 16:26:45 -0500 Subject: tracing: Move mmio tracer config up with the other tracers Move the config that enables the mmiotracer with the other tracers such that all the tracers are together. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 47d0149347a9..2014056682f5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -353,6 +353,19 @@ config HWLAT_TRACER file. Every time a latency is greater than tracing_thresh, it will be recorded into the ring buffer. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && PCI + select GENERIC_TRACER + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/trace/mmiotrace.rst. + If you are not helping to develop drivers, say N. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER @@ -627,19 +640,6 @@ config EVENT_TRACE_TEST_SYSCALLS TBD - enable a way to actually call the syscalls as we test their events -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on HAVE_MMIOTRACE_SUPPORT && PCI - select GENERIC_TRACER - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/trace/mmiotrace.rst. - If you are not helping to develop drivers, say N. - config TRACING_MAP bool depends on ARCH_HAVE_NMI_SAFE_CMPXCHG -- cgit v1.2.3 From 1e837945a8854227f3f4e4d2d7abff64ed320830 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Jan 2020 16:30:30 -0500 Subject: tracing: Move tracing selftests to bottom of menu Move all the tracing selftest configs to the bottom of the tracing menu. There's no reason for them to be interspersed throughout. Also, move the bootconfig menu to the top. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 168 +++++++++++++++++++++++++-------------------------- 1 file changed, 84 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2014056682f5..91e885194dbc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,6 +141,15 @@ menuconfig FTRACE if FTRACE +config BOOTTIME_TRACING + bool "Boot-time Tracing support" + depends on BOOT_CONFIG && TRACING + default y + help + Enable developer to setup ftrace subsystem via supplemental + kernel cmdline at boot time for debugging (tracing) driver + initialization and boot process. + config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER @@ -605,41 +614,6 @@ config FTRACE_MCOUNT_RECORD depends on DYNAMIC_FTRACE depends on HAVE_FTRACE_MCOUNT_RECORD -config FTRACE_SELFTEST - bool - -config FTRACE_STARTUP_TEST - bool "Perform a startup test on ftrace" - depends on GENERIC_TRACER - select FTRACE_SELFTEST - help - This option performs a series of startup tests on ftrace. On bootup - a series of tests are made to verify that the tracer is - functioning properly. It will do tests on all the configured - tracers of ftrace. - -config EVENT_TRACE_STARTUP_TEST - bool "Run selftest on trace events" - depends on FTRACE_STARTUP_TEST - default y - help - This option performs a test on all trace events in the system. - It basically just enables each event and runs some code that - will trigger events (not necessarily the event it enables) - This may take some time run as there are a lot of events. - -config EVENT_TRACE_TEST_SYSCALLS - bool "Run selftest on syscall events" - depends on EVENT_TRACE_STARTUP_TEST - help - This option will also enable testing every syscall event. - It only enables the event and disables it and runs various loads - with the event enabled. This adds a bit more time for kernel boot - up since it runs this on every system call defined. - - TBD - enable a way to actually call the syscalls as we test their - events - config TRACING_MAP bool depends on ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -726,6 +700,81 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" + depends on TRACING + help + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. This can cause problems for user space tools + that use this string to parse the raw data as user space does not know + how to convert the string to its value. + + To fix this, there's a special macro in the kernel that can be used + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. + + If something does not get converted properly, this option can be + used to show what enums/sizeof the kernel tried to convert. + + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the + names matched with their values and what trace event system they + belong too. + + Normally, the mapping of the strings to values will be freed after + boot up or module load. With this option, they will not be freed, as + they are needed for the "eval_map" file. Enabling this option will + increase the memory footprint of the running kernel. + + If unsure, say N. + +config GCOV_PROFILE_FTRACE + bool "Enable GCOV profiling on ftrace subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on ftrace subsystem for checking + which functions/lines are tested. + + If unsure, say N. + + Note that on a kernel compiled with this config, ftrace will + run significantly slower. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on GENERIC_TRACER + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config EVENT_TRACE_STARTUP_TEST + bool "Run selftest on trace events" + depends on FTRACE_STARTUP_TEST + default y + help + This option performs a test on all trace events in the system. + It basically just enables each event and runs some code that + will trigger events (not necessarily the event it enables) + This may take some time run as there are a lot of events. + +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on EVENT_TRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER @@ -799,55 +848,6 @@ config KPROBE_EVENT_GEN_TEST If unsure, say N. -config TRACE_EVAL_MAP_FILE - bool "Show eval mappings for trace events" - depends on TRACING - help - The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools - that use this string to parse the raw data as user space does not know - how to convert the string to its value. - - To fix this, there's a special macro in the kernel that can be used - to convert an enum/sizeof into its value. If this macro is used, then - the print fmt strings will be converted to their values. - - If something does not get converted properly, this option can be - used to show what enums/sizeof the kernel tried to convert. - - This option is for debugging the conversions. A file is created - in the tracing directory called "eval_map" that will show the - names matched with their values and what trace event system they - belong too. - - Normally, the mapping of the strings to values will be freed after - boot up or module load. With this option, they will not be freed, as - they are needed for the "eval_map" file. Enabling this option will - increase the memory footprint of the running kernel. - - If unsure, say N. - -config GCOV_PROFILE_FTRACE - bool "Enable GCOV profiling on ftrace subsystem" - depends on GCOV_KERNEL - help - Enable GCOV profiling on ftrace subsystem for checking - which functions/lines are tested. - - If unsure, say N. - - Note that on a kernel compiled with this config, ftrace will - run significantly slower. - -config BOOTTIME_TRACING - bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING - default y - help - Enable developer to setup ftrace subsystem via supplemental - kernel cmdline at boot time for debugging (tracing) driver - initialization and boot process. - endif # FTRACE endif # TRACING_SUPPORT -- cgit v1.2.3 From 0cd9d33ace336bc424fc30944aa3defd6786e4fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 30 Jan 2020 11:37:33 -0500 Subject: cgroup: init_tasks shouldn't be linked to the root cgroup 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") removed lazy initialization of css_sets so that new tasks are always lniked to its css_set. In the process, it incorrectly ended up adding init_tasks to root css_set. They show up as PID 0's in root's cgroup.procs triggering warnings in systemd and generally confusing people. Fix it by skip css_set linking for init_tasks. Signed-off-by: Tejun Heo Reported-by: https://github.com/joanbm Link: https://github.com/systemd/systemd/issues/14682 Fixes: 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") Cc: stable@vger.kernel.org # v5.5+ --- kernel/cgroup/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b3744872263e..cf8a36bdf5c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5932,11 +5932,14 @@ void cgroup_post_fork(struct task_struct *child) spin_lock_irq(&css_set_lock); - WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); + /* init tasks are special, only link regular threads */ + if (likely(child->pid)) { + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + } /* * If the cgroup has to be frozen, the new task has too. Let's set -- cgit v1.2.3 From c67c10a67f6b2edcc7804317947cbfdeab71048c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Sat, 9 Nov 2019 11:16:41 -0800 Subject: kdb: kdb_current_regs should be private As of the patch ("MIPS: kdb: Remove old workaround for backtracing on other CPUs") there is no reason for kdb_current_regs to be in the public "kdb.h". Let's move it next to kdb_current_task. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20191109111623.2.Iadbfb484e90b557cc4b5ac9890bfca732cd99d77@changeid Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_private.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 55d052061ef9..e829b22f3946 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -242,6 +242,7 @@ extern void debug_kusage(void); extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; +extern struct pt_regs *kdb_current_regs; #ifdef CONFIG_KDB_KEYBOARD extern void kdb_kbd_cleanup_state(void); -- cgit v1.2.3 From a8649fb0a8c1de9c842c9c492f189cabbc468272 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Sat, 9 Nov 2019 11:16:42 -0800 Subject: kdb: kdb_current_task shouldn't be exported The kdb_current_task variable has been declared in "kernel/debug/kdb/kdb_private.h" since 2010 when kdb was added to the mainline kernel. This is not a public header. There should be no reason that kdb_current_task should be exported and there are no in-kernel users that need it. Remove the export. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20191109111623.3.I14b22b5eb15ca8f3812ab33e96621231304dc1f7@changeid Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4567fe998c30..4d44b3746836 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -73,7 +73,6 @@ int kdb_nextline = 1; int kdb_state; /* General KDB state */ struct task_struct *kdb_current_task; -EXPORT_SYMBOL(kdb_current_task); struct pt_regs *kdb_current_regs; const char *kdb_diemsg; -- cgit v1.2.3 From 9441d5f6b77770ee388884f04b14a99b028a15e6 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Sat, 9 Nov 2019 11:16:43 -0800 Subject: kdb: Gid rid of implicit setting of the current task / regs Some (but not all?) of the kdb backtrace paths would cause the kdb_current_task and kdb_current_regs to remain changed. As discussed in a review of a previous patch [1], this doesn't seem intuitive, so let's fix that. ...but, it turns out that there's actually no longer any reason to set the current task / current regs while backtracing anymore anyway. As of commit 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master") if we're backtracing on a task running on a CPU we ask that CPU to do the backtrace itself. Linux can do that without anything fancy. If we're doing backtrace on a sleeping task we can also do that fine without updating globals. So this patch mostly just turns into deleting a bunch of code. [1] https://lore.kernel.org/r/20191010150735.dhrj3pbjgmjrdpwr@holly.lan Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20191109111624.4.Ibc3d982bbeb9e46872d43973ba808cd4c79537c7@changeid Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 8 +------- kernel/debug/kdb/kdb_main.c | 2 +- kernel/debug/kdb/kdb_private.h | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 4af48ac53625..3de0cc780c16 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -119,7 +119,6 @@ kdb_bt_cpu(unsigned long cpu) return; } - kdb_set_current_task(kdb_tsk); kdb_bt1(kdb_tsk, ~0UL, false); } @@ -166,10 +165,8 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; p = find_task_by_pid_ns(pid, &init_pid_ns); - if (p) { - kdb_set_current_task(p); + if (p) return kdb_bt1(p, ~0UL, false); - } kdb_printf("No process with pid == %ld found\n", pid); return 0; } else if (strcmp(argv[0], "btt") == 0) { @@ -178,11 +175,9 @@ kdb_bt(int argc, const char **argv) diag = kdbgetularg((char *)argv[1], &addr); if (diag) return diag; - kdb_set_current_task((struct task_struct *)addr); return kdb_bt1((struct task_struct *)addr, ~0UL, false); } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; - struct task_struct *save_current_task = kdb_current_task; if (argc > 1) return KDB_ARGCOUNT; if (argc == 1) { @@ -204,7 +199,6 @@ kdb_bt(int argc, const char **argv) kdb_bt_cpu(cpu); touch_nmi_watchdog(); } - kdb_set_current_task(save_current_task); } return 0; } else { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d44b3746836..ba12e9f4661e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1138,7 +1138,7 @@ static void kdb_dumpregs(struct pt_regs *regs) console_loglevel = old_lvl; } -void kdb_set_current_task(struct task_struct *p) +static void kdb_set_current_task(struct task_struct *p) { kdb_current_task = p; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e829b22f3946..2e296e4a234c 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -240,7 +240,6 @@ extern void *debug_kmalloc(size_t size, gfp_t flags); extern void debug_kfree(void *); extern void debug_kusage(void); -extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; extern struct pt_regs *kdb_current_regs; -- cgit v1.2.3 From bbfceba15f8d1260c328a254efc2b3f2deae4904 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Sat, 9 Nov 2019 11:16:44 -0800 Subject: kdb: Get rid of confusing diag msg from "rd" if current task has no regs If you switch to a sleeping task with the "pid" command and then type "rd", kdb tells you this: No current kdb registers. You may need to select another task diag: -17: Invalid register name The first message makes sense, but not the second. Fix it by just returning 0 after commands accessing the current registers finish if we've already printed the "No current kdb registers" error. While fixing kdb_rd(), change the function to use "if" rather than "ifdef". It cleans the function up a bit and any modern compiler will have no trouble handling still producing good code. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20191109111624.5.I121f4c6f0c19266200bf6ef003de78841e5bfc3d@changeid Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ba12e9f4661e..b22292b649c4 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -543,9 +543,8 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, if (diag) return diag; } else if (symname[0] == '%') { - diag = kdb_check_regs(); - if (diag) - return diag; + if (kdb_check_regs()) + return 0; /* Implement register values with % at a later time as it is * arch optional. */ @@ -1836,8 +1835,7 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int len = kdb_check_regs(); -#if DBG_MAX_REG_NUM > 0 + int len = 0; int i; char *rname; int rsize; @@ -1846,8 +1844,14 @@ static int kdb_rd(int argc, const char **argv) u16 reg16; u8 reg8; - if (len) - return len; + if (kdb_check_regs()) + return 0; + + /* Fallback to Linux showregs() if we don't have DBG_MAX_REG_NUM */ + if (DBG_MAX_REG_NUM <= 0) { + kdb_dumpregs(kdb_current_regs); + return 0; + } for (i = 0; i < DBG_MAX_REG_NUM; i++) { rsize = dbg_reg_def[i].size * 2; @@ -1889,12 +1893,7 @@ static int kdb_rd(int argc, const char **argv) } } kdb_printf("\n"); -#else - if (len) - return len; - kdb_dumpregs(kdb_current_regs); -#endif return 0; } @@ -1928,9 +1927,8 @@ static int kdb_rm(int argc, const char **argv) if (diag) return diag; - diag = kdb_check_regs(); - if (diag) - return diag; + if (kdb_check_regs()) + return 0; diag = KDB_BADREG; for (i = 0; i < DBG_MAX_REG_NUM; i++) { -- cgit v1.2.3 From a4f8a7fb1963bc02fbd40a0a28e128bb56d2fcc9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 28 Nov 2019 13:07:53 +0000 Subject: kdb: remove redundant assignment to pointer bp The point bp is assigned a value that is never read, it is being re-assigned later to bp = &kdb_breakpoints[lowbp] in a for-loop. Remove the redundant assignment. Addresses-Coverity ("Unused value") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20191128130753.181246-1-colin.king@canonical.com Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 62c301ad0773..d7ebb2c79cb8 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -412,7 +412,6 @@ static int kdb_bc(int argc, const char **argv) * assume that the breakpoint number is desired. */ if (addr < KDB_MAXBPT) { - bp = &kdb_breakpoints[addr]; lowbp = highbp = addr; highbp++; } else { -- cgit v1.2.3 From dc2c733e65848b1df8d55c83eea79fc4a868c800 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2020 18:14:40 +0200 Subject: kdb: Use for_each_console() helper Replace open coded single-linked list iteration loop with for_each_console() helper in use. Signed-off-by: Andy Shevchenko Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 8bcdded5d61f..924bc9298a42 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -553,7 +553,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - struct console *c = console_drivers; + struct console *c; unsigned long uninitialized_var(flags); /* Serialize kdb_printf if multiple cpus try to write at once. @@ -698,10 +698,9 @@ kdb_printit: cp2++; } } - while (c) { + for_each_console(c) { c->write(c, cp, retlen - (cp - kdb_buffer)); touch_nmi_watchdog(); - c = c->next; } } if (logging) { @@ -752,7 +751,6 @@ kdb_printit: moreprompt = "more> "; kdb_input_flush(); - c = console_drivers; if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(moreprompt); @@ -762,10 +760,9 @@ kdb_printit: cp++; } } - while (c) { + for_each_console(c) { c->write(c, moreprompt, strlen(moreprompt)); touch_nmi_watchdog(); - c = c->next; } if (logging) -- cgit v1.2.3 From fdeb1aca2861472b38779be44141757483300827 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 12:59:26 -0600 Subject: tracing: Change trace_boot to use synth_event interface Have trace_boot_add_synth_event() use the synth_event interface. Also, rename synth_event_run_cmd() to synth_event_run_command() now that trace_boot's version is gone. Link: http://lkml.kernel.org/r/94f1fa0e31846d0bddca916b8663404b20559e34.1580323897.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 31 ++++++++++++------------------- kernel/trace/trace_events_hist.c | 9 ++------- 2 files changed, 14 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 2298a70cdda6..06d7feb5255f 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -125,38 +125,31 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) #endif #ifdef CONFIG_HIST_TRIGGERS -extern int synth_event_run_command(const char *command); - static int __init trace_boot_add_synth_event(struct xbc_node *node, const char *event) { + struct dynevent_cmd cmd; struct xbc_node *anode; - char buf[MAX_BUF_LEN], *q; + char buf[MAX_BUF_LEN]; const char *p; - int len, delta, ret; + int ret; - len = ARRAY_SIZE(buf); - delta = snprintf(buf, len, "%s", event); - if (delta >= len) { - pr_err("Event name is too long: %s\n", event); - return -E2BIG; - } - len -= delta; q = buf + delta; + synth_event_cmd_init(&cmd, buf, MAX_BUF_LEN); + + ret = synth_event_gen_cmd_start(&cmd, event, NULL); + if (ret) + return ret; xbc_node_for_each_array_value(node, "fields", anode, p) { - delta = snprintf(q, len, " %s;", p); - if (delta >= len) { - pr_err("fields string is too long: %s\n", p); - return -E2BIG; - } - len -= delta; q += delta; + ret = synth_event_add_field_str(&cmd, p); + if (ret) + return ret; } - ret = synth_event_run_command(buf); + ret = synth_event_gen_cmd_end(&cmd); if (ret < 0) pr_err("Failed to add synthetic event: %s\n", buf); - return ret; } #else diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4d56a4f0310d..2e88c9805f4b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1755,12 +1755,7 @@ static int create_or_delete_synth_event(int argc, char **argv) return ret == -ECANCELED ? -EINVAL : ret; } -int synth_event_run_command(const char *command) -{ - return trace_run_command(command, create_or_delete_synth_event); -} - -static int synth_event_run_cmd(struct dynevent_cmd *cmd) +static int synth_event_run_command(struct dynevent_cmd *cmd) { struct synth_event *se; int ret; @@ -1790,7 +1785,7 @@ static int synth_event_run_cmd(struct dynevent_cmd *cmd) void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) { dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, - synth_event_run_cmd); + synth_event_run_command); } EXPORT_SYMBOL_GPL(synth_event_cmd_init); -- cgit v1.2.3 From d380dcde9a07ca5de4805dee11f58a98ec0ad6ff Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jan 2020 21:18:18 -0500 Subject: tracing: Fix now invalid var_ref_vals assumption in trace action The patch 'tracing: Fix histogram code when expression has same var as value' added code to return an existing variable reference when creating a new variable reference, which resulted in var_ref_vals slots being reused instead of being duplicated. The implementation of the trace action assumes that the end of the var_ref_vals array starting at action_data.var_ref_idx corresponds to the values that will be assigned to the trace params. The patch mentioned above invalidates that assumption, which means that each param needs to explicitly specify its index into var_ref_vals. This fix changes action_data.var_ref_idx to an array of var ref indexes to account for that. Link: https://lore.kernel.org/r/1580335695.6220.8.camel@kernel.org Fixes: 8bcebc77e85f ("tracing: Fix histogram code when expression has same var as value") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 53 ++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2e88c9805f4b..5b4e04780411 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -476,11 +476,12 @@ struct action_data { * When a histogram trigger is hit, the values of any * references to variables, including variables being passed * as parameters to synthetic events, are collected into a - * var_ref_vals array. This var_ref_idx is the index of the - * first param in the array to be passed to the synthetic - * event invocation. + * var_ref_vals array. This var_ref_idx array is an array of + * indices into the var_ref_vals array, one for each synthetic + * event param, and is passed to the synthetic event + * invocation. */ - unsigned int var_ref_idx; + unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; struct synth_event *synth_event; bool use_trace_keyword; char *synth_event_name; @@ -884,14 +885,14 @@ static struct trace_event_functions synth_event_funcs = { static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, - unsigned int var_ref_idx) + unsigned int *var_ref_idx) { struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; struct trace_buffer *buffer; struct synth_event *event; - unsigned int i, n_u64; + unsigned int i, n_u64, val_idx; int fields_size = 0; event = trace_file->event_call->data; @@ -914,15 +915,16 @@ static notrace void trace_event_raw_event_synth(void *__data, goto out; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; if (event->fields[i]->is_string) { - char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; + char *str_val = (char *)(long)var_ref_vals[val_idx]; char *str_field = (char *)&entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { struct synth_field *field = event->fields[i]; - u64 val = var_ref_vals[var_ref_idx + i]; + u64 val = var_ref_vals[val_idx]; switch (field->size) { case 1: @@ -1122,10 +1124,10 @@ static struct tracepoint *alloc_synth_tracepoint(char *name) } typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, - unsigned int var_ref_idx); + unsigned int *var_ref_idx); static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, - unsigned int var_ref_idx) + unsigned int *var_ref_idx) { struct tracepoint *tp = event->tp; @@ -3506,6 +3508,22 @@ static int init_var_ref(struct hist_field *ref_field, goto out; } +static int find_var_ref_idx(struct hist_trigger_data *hist_data, + struct hist_field *var_field) +{ + struct hist_field *ref_field; + int i; + + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) + return i; + } + + return -ENOENT; +} + /** * create_var_ref - Create a variable reference and attach it to trigger * @hist_data: The trigger that will be referencing the variable @@ -5071,11 +5089,11 @@ static int trace_action_create(struct hist_trigger_data *hist_data, struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; - unsigned int i, var_ref_idx; + unsigned int i; unsigned int field_pos = 0; struct synth_event *event; char *synth_event_name; - int ret = 0; + int var_ref_idx, ret = 0; lockdep_assert_held(&event_mutex); @@ -5092,8 +5110,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event->ref++; - var_ref_idx = hist_data->n_var_refs; - for (i = 0; i < data->n_params; i++) { char *p; @@ -5142,6 +5158,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, goto err; } + var_ref_idx = find_var_ref_idx(hist_data, var_ref); + if (WARN_ON(var_ref_idx < 0)) { + ret = var_ref_idx; + goto err; + } + + data->var_ref_idx[i] = var_ref_idx; + field_pos++; kfree(p); continue; @@ -5160,7 +5184,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data, } data->synth_event = event; - data->var_ref_idx = var_ref_idx; out: return ret; err: -- cgit v1.2.3 From 43e76af85fa7e75ac9b71fc2fcc250abb1889bff Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 30 Jan 2020 22:17:35 -0800 Subject: kcov: ignore fault-inject and stacktrace Don't instrument 3 more files that contain debugging facilities and produce large amounts of uninteresting coverage for every syscall. The following snippets are sprinkled all over the place in kcov traces in a debugging kernel. We already try to disable instrumentation of stack unwinding code and of most debug facilities. I guess we did not use fault-inject.c at the time, and stacktrace.c was somehow missed (or something has changed in kernel/configs). This change both speeds up kcov (kernel doesn't need to store these PCs, user-space doesn't need to process them) and frees trace buffer capacity for more useful coverage. should_fail lib/fault-inject.c:149 fail_dump lib/fault-inject.c:45 stack_trace_save kernel/stacktrace.c:124 stack_trace_consume_entry kernel/stacktrace.c:86 stack_trace_consume_entry kernel/stacktrace.c:89 ... a hundred frames skipped ... stack_trace_consume_entry kernel/stacktrace.c:93 stack_trace_consume_entry kernel/stacktrace.c:86 Link: http://lkml.kernel.org/r/20200116111449.217744-1-dvyukov@gmail.com Signed-off-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f2cc0d118a0b..4cb4130ced32 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,6 +27,7 @@ KCOV_INSTRUMENT_softirq.o := n # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n KCOV_INSTRUMENT_extable.o := n +KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n -- cgit v1.2.3 From 249d7b2ef674cdae28c377cfe6f56696548305d5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 31 Jan 2020 15:55:31 -0600 Subject: tracing: Consolidate some synth_event_trace code The synth_event trace code contains some almost identical functions and some small functions that are called only once - consolidate the common code into single functions and fold in the small functions to simplify the code overall. Link: http://lkml.kernel.org/r/d1c8d8ad124a653b7543afe801d38c199ca5c20e.1580506712.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 141 ++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5b4e04780411..42058a1b5146 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2053,24 +2053,72 @@ out: } EXPORT_SYMBOL_GPL(synth_event_trace_start); -static int save_synth_val(struct synth_field *field, u64 val, - struct synth_event_trace_state *trace_state) +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry = trace_state->entry; + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (!trace_state->enabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + entry = trace_state->entry; if (field->is_string) { char *str_val = (char *)(long)val; char *str_field; - if (!str_val) - return -EINVAL; + if (!str_val) { + ret = -EINVAL; + goto out; + } str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); } else entry->fields[field->offset] = val; - - return 0; + out: + return ret; } /** @@ -2104,54 +2152,10 @@ static int save_synth_val(struct synth_field *field, u64 val, int synth_event_add_next_val(u64 val, struct synth_event_trace_state *trace_state) { - struct synth_field *field; - struct synth_event *event; - int ret = 0; - - if (!trace_state) { - ret = -EINVAL; - goto out; - } - - /* can't mix add_next_synth_val() with add_synth_val() */ - if (trace_state->add_name) { - ret = -EINVAL; - goto out; - } - trace_state->add_next = true; - - if (!trace_state->enabled) - goto out; - - event = trace_state->event; - - if (trace_state->cur_field >= event->n_fields) { - ret = -EINVAL; - goto out; - } - - field = event->fields[trace_state->cur_field++]; - ret = save_synth_val(field, val, trace_state); - out: - return ret; + return __synth_event_add_val(NULL, val, trace_state); } EXPORT_SYMBOL_GPL(synth_event_add_next_val); -static struct synth_field *find_synth_field(struct synth_event *event, - const char *field_name) -{ - struct synth_field *field = NULL; - unsigned int i; - - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - if (strcmp(field->name, field_name) == 0) - return field; - } - - return NULL; -} - /** * synth_event_add_val - Add a named field's value to an open synth trace * @field_name: The name of the synthetic event field value to set @@ -2183,38 +2187,7 @@ static struct synth_field *find_synth_field(struct synth_event *event, int synth_event_add_val(const char *field_name, u64 val, struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry; - struct synth_event *event; - struct synth_field *field; - int ret = 0; - - if (!trace_state) { - ret = -EINVAL; - goto out; - } - - /* can't mix add_next_synth_val() with add_synth_val() */ - if (trace_state->add_next) { - ret = -EINVAL; - goto out; - } - trace_state->add_name = true; - - if (!trace_state->enabled) - goto out; - - event = trace_state->event; - entry = trace_state->entry; - - field = find_synth_field(event, field_name); - if (!field) { - ret = -EINVAL; - goto out; - } - - ret = save_synth_val(field, val, trace_state); - out: - return ret; + return __synth_event_add_val(field_name, val, trace_state); } EXPORT_SYMBOL_GPL(synth_event_add_val); -- cgit v1.2.3 From 6f1a4891a5928a5969c87fa5a584844c983ec823 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 Jan 2020 15:26:52 +0100 Subject: x86/apic/msi: Plug non-maskable MSI affinity race Evan tracked down a subtle race between the update of the MSI message and the device raising an interrupt internally on PCI devices which do not support MSI masking. The update of the MSI message is non-atomic and consists of either 2 or 3 sequential 32bit wide writes to the PCI config space. - Write address low 32bits - Write address high 32bits (If supported by device) - Write data When an interrupt is migrated then both address and data might change, so the kernel attempts to mask the MSI interrupt first. But for MSI masking is optional, so there exist devices which do not provide it. That means that if the device raises an interrupt internally between the writes then a MSI message is sent built from half updated state. On x86 this can lead to spurious interrupts on the wrong interrupt vector when the affinity setting changes both address and data. As a consequence the device interrupt can be lost causing the device to become stuck or malfunctioning. Evan tried to handle that by disabling MSI accross an MSI message update. That's not feasible because disabling MSI has issues on its own: If MSI is disabled the PCI device is routing an interrupt to the legacy INTx mechanism. The INTx delivery can be disabled, but the disablement is not working on all devices. Some devices lose interrupts when both MSI and INTx delivery are disabled. Another way to solve this would be to enforce the allocation of the same vector on all CPUs in the system for this kind of screwed devices. That could be done, but it would bring back the vector space exhaustion problems which got solved a few years ago. Fortunately the high address (if supported by the device) is only relevant when X2APIC is enabled which implies interrupt remapping. In the interrupt remapping case the affinity setting is happening at the interrupt remapping unit and the PCI MSI message is programmed only once when the PCI device is initialized. That makes it possible to solve it with a two step update: 1) Target the MSI msg to the new vector on the current target CPU 2) Target the MSI msg to the new vector on the new target CPU In both cases writing the MSI message is only changing a single 32bit word which prevents the issue of inconsistency. After writing the final destination it is necessary to check whether the device issued an interrupt while the intermediate state #1 (new vector, current CPU) was in effect. This is possible because the affinity change is always happening on the current target CPU. The code runs with interrupts disabled, so the interrupt can be detected by checking the IRR of the local APIC. If the vector is pending in the IRR then the interrupt is retriggered on the new target CPU by sending an IPI for the associated vector on the target CPU. This can cause spurious interrupts on both the local and the new target CPU. 1) If the new vector is not in use on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then interrupt entry code will ignore that spurious interrupt. The vector is marked so that the 'No irq handler for vector' warning is supressed once. 2) If the new vector is in use already on the local CPU then the IRR check might see an pending interrupt from the device which is using this vector. The IPI to the new target CPU will then invoke the handler of the device, which got the affinity change, even if that device did not issue an interrupt 3) If the new vector is in use already on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then the handler of the device which uses that vector on the local CPU will be invoked. expose issues in device driver interrupt handlers which are not prepared to handle a spurious interrupt correctly. This not a regression, it's just exposing something which was already broken as spurious interrupts can happen for a lot of reasons and all driver handlers need to be able to deal with them. Reported-by: Evan Green Debugged-by: Evan Green Signed-off-by: Thomas Gleixner Tested-by: Evan Green Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87imkr4s7n.fsf@nanos.tec.linutronix.de --- kernel/irq/debugfs.c | 1 + kernel/irq/msi.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c1eccd4f6520..a949bd39e343 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), + BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ad26fbcfbfc8..eb95f6106a1e 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, continue; irq_data = irq_domain_get_irq_data(domain, desc->irq); - if (!can_reserve) + if (!can_reserve) { irqd_clr_can_reserve(irq_data); + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irq_data); + } ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) goto cleanup; -- cgit v1.2.3 From febac332a819f0e764aa4da62757ba21d18c182b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 31 Jan 2020 19:08:59 +0300 Subject: clocksource: Prevent double add_timer_on() for watchdog_timer Kernel crashes inside QEMU/KVM are observed: kernel BUG at kernel/time/timer.c:1154! BUG_ON(timer_pending(timer) || !timer->function) in add_timer_on(). At the same time another cpu got: general protection fault: 0000 [#1] SMP PTI of poinson pointer 0xdead000000000200 in: __hlist_del at include/linux/list.h:681 (inlined by) detach_timer at kernel/time/timer.c:818 (inlined by) expire_timers at kernel/time/timer.c:1355 (inlined by) __run_timers at kernel/time/timer.c:1686 (inlined by) run_timer_softirq at kernel/time/timer.c:1699 Unfortunately kernel logs are badly scrambled, stacktraces are lost. Printing the timer->function before the BUG_ON() pointed to clocksource_watchdog(). The execution of clocksource_watchdog() can race with a sequence of clocksource_stop_watchdog() .. clocksource_start_watchdog(): expire_timers() detach_timer(timer, true); timer->entry.pprev = NULL; raw_spin_unlock_irq(&base->lock); call_timer_fn clocksource_watchdog() clocksource_watchdog_kthread() or clocksource_unbind() spin_lock_irqsave(&watchdog_lock, flags); clocksource_stop_watchdog(); del_timer(&watchdog_timer); watchdog_running = 0; spin_unlock_irqrestore(&watchdog_lock, flags); spin_lock_irqsave(&watchdog_lock, flags); clocksource_start_watchdog(); add_timer_on(&watchdog_timer, ...); watchdog_running = 1; spin_unlock_irqrestore(&watchdog_lock, flags); spin_lock(&watchdog_lock); add_timer_on(&watchdog_timer, ...); BUG_ON(timer_pending(timer) || !timer->function); timer_pending() -> true BUG() I.e. inside clocksource_watchdog() watchdog_timer could be already armed. Check timer_pending() before calling add_timer_on(). This is sufficient as all operations are synchronized by watchdog_lock. Fixes: 75c5158f70c0 ("timekeeping: Update clocksource with stop_machine") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/158048693917.4378.13823603769948933793.stgit@buzz --- kernel/time/clocksource.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fff5f64981c6..428beb69426a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused) next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); + } out: spin_unlock(&watchdog_lock); } -- cgit v1.2.3 From 74403b6c50dd7a633d3f22f59f975d6081eae093 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 31 Jan 2020 15:55:32 -0600 Subject: tracing: Remove check_arg() callbacks from dynevent args It's kind of strange to have check_arg() callbacks as part of the arg objects themselves; it makes more sense to just pass these in when the args are added instead. Remove the check_arg() callbacks from those objects which also means removing the check_arg() args from the init functions, adding them to the add functions and fixing up existing callers. Link: http://lkml.kernel.org/r/c7708d6f177fcbe1a36b6e4e8e150907df0fa5d2.1580506712.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 62 ++++++++++++++++++---------------------- kernel/trace/trace_dynevent.h | 11 ++++--- kernel/trace/trace_events_hist.c | 16 +++++------ kernel/trace/trace_kprobe.c | 10 +++---- 4 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 6ffdbc4fda53..f9cfcdc9d1f3 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -228,27 +228,30 @@ fs_initcall(init_dynamic_event); * dynevent_arg_add - Add an arg to a dynevent_cmd * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd * @arg: The argument to append to the current cmd + * @check_arg: An (optional) pointer to a function checking arg sanity * * Append an argument to a dynevent_cmd. The argument string will be * appended to the current cmd string, followed by a separator, if - * applicable. Before the argument is added, the check_arg() - * function, if defined, is called. + * applicable. Before the argument is added, the @check_arg function, + * if present, will be used to check the sanity of the current arg + * string. * - * The cmd string, separator, and check_arg() function should be set - * using the dynevent_arg_init() before any arguments are added using - * this function. + * The cmd string and separator should be set using the + * dynevent_arg_init() before any arguments are added using this + * function. * * Return: 0 if successful, error otherwise. */ int dynevent_arg_add(struct dynevent_cmd *cmd, - struct dynevent_arg *arg) + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg) { int ret = 0; int delta; char *q; - if (arg->check_arg) { - ret = arg->check_arg(arg); + if (check_arg) { + ret = check_arg(arg); if (ret) return ret; } @@ -269,6 +272,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, * dynevent_arg_pair_add - Add an arg pair to a dynevent_cmd * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd * @arg_pair: The argument pair to append to the current cmd + * @check_arg: An (optional) pointer to a function checking arg sanity * * Append an argument pair to a dynevent_cmd. An argument pair * consists of a left-hand-side argument and a right-hand-side @@ -278,24 +282,26 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, * * The lhs argument string will be appended to the current cmd string, * followed by an operator, if applicable, followd by the rhs string, - * followed finally by a separator, if applicable. Before anything is - * added, the check_arg() function, if defined, is called. + * followed finally by a separator, if applicable. Before the + * argument is added, the @check_arg function, if present, will be + * used to check the sanity of the current arg strings. * - * The cmd strings, operator, separator, and check_arg() function - * should be set using the dynevent_arg_pair_init() before any arguments - * are added using this function. + * The cmd strings, operator, and separator should be set using the + * dynevent_arg_pair_init() before any arguments are added using this + * function. * * Return: 0 if successful, error otherwise. */ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, - struct dynevent_arg_pair *arg_pair) + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg) { int ret = 0; int delta; char *q; - if (arg_pair->check_arg) { - ret = arg_pair->check_arg(arg_pair); + if (check_arg) { + ret = check_arg(arg_pair); if (ret) return ret; } @@ -385,20 +391,16 @@ void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, /** * dynevent_arg_init - Initialize a dynevent_arg object * @arg: A pointer to the dynevent_arg struct representing the arg - * @check_arg: An (optional) pointer to a function checking arg sanity * @separator: An (optional) separator, appended after adding the arg * * Initialize a dynevent_arg object. A dynevent_arg represents an * object used to append single arguments to the current command - * string. The @check_arg function, if present, will be used to check - * the sanity of the current arg string (which is directly set by the - * caller). After the arg string is successfully appended to the + * string. After the arg string is successfully appended to the * command string, the optional @separator is appended. If no * separator was specified when initializing the arg, a space will be * appended. */ void dynevent_arg_init(struct dynevent_arg *arg, - dynevent_check_arg_fn_t check_arg, char separator) { memset(arg, '\0', sizeof(*arg)); @@ -406,14 +408,11 @@ void dynevent_arg_init(struct dynevent_arg *arg, if (!separator) separator = ' '; arg->separator = separator; - - arg->check_arg = check_arg; } /** * dynevent_arg_pair_init - Initialize a dynevent_arg_pair object * @arg_pair: A pointer to the dynevent_arg_pair struct representing the arg - * @check_arg: An (optional) pointer to a function checking arg sanity * @operator: An (optional) operator, appended after adding the first arg * @separator: An (optional) separator, appended after adding the second arg * @@ -422,16 +421,13 @@ void dynevent_arg_init(struct dynevent_arg *arg, * variable_name;' or 'x+y' to the current command string. An * argument pair consists of a left-hand-side argument and a * right-hand-side argument separated by an operator, which can be - * whitespace, all followed by a separator, if applicable. The - * @check_arg function, if present, will be used to check the sanity - * of the current arg strings (which is directly set by the caller). - * After the first arg string is successfully appended to the command - * string, the optional @operator is appended, followed by the second - * arg and and optional @separator. If no separator was specified - * when initializing the arg, a space will be appended. + * whitespace, all followed by a separator, if applicable. After the + * first arg string is successfully appended to the command string, + * the optional @operator is appended, followed by the second arg and + * and optional @separator. If no separator was specified when + * initializing the arg, a space will be appended. */ void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, - dynevent_check_arg_fn_t check_arg, char operator, char separator) { memset(arg_pair, '\0', sizeof(*arg_pair)); @@ -443,8 +439,6 @@ void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, if (!separator) separator = ' '; arg_pair->separator = separator; - - arg_pair->check_arg = check_arg; } /** diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index b593fc34c5b1..d6857a254ede 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -126,28 +126,27 @@ typedef int (*dynevent_check_arg_fn_t)(void *data); struct dynevent_arg { const char *str; char separator; /* e.g. ';', ',', or nothing */ - dynevent_check_arg_fn_t check_arg; }; extern void dynevent_arg_init(struct dynevent_arg *arg, - dynevent_check_arg_fn_t check_arg, char separator); extern int dynevent_arg_add(struct dynevent_cmd *cmd, - struct dynevent_arg *arg); + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg); struct dynevent_arg_pair { const char *lhs; const char *rhs; char operator; /* e.g. '=' or nothing */ char separator; /* e.g. ';', ',', or nothing */ - dynevent_check_arg_fn_t check_arg; }; extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, - dynevent_check_arg_fn_t check_arg, char operator, char separator); + extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd, - struct dynevent_arg_pair *arg_pair); + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg); extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str); #endif diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 42058a1b5146..d2817fe52f32 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1334,12 +1334,12 @@ int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, if (!type || !name) return -EINVAL; - dynevent_arg_pair_init(&arg_pair, synth_event_check_arg_fn, 0, ';'); + dynevent_arg_pair_init(&arg_pair, 0, ';'); arg_pair.lhs = type; arg_pair.rhs = name; - ret = dynevent_arg_pair_add(cmd, &arg_pair); + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); if (ret) return ret; @@ -1377,11 +1377,11 @@ int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) if (!type_name) return -EINVAL; - dynevent_arg_init(&arg, NULL, ';'); + dynevent_arg_init(&arg, ';'); arg.str = type_name; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) return ret; @@ -1472,9 +1472,9 @@ int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, if (cmd->type != DYNEVENT_TYPE_SYNTH) return -EINVAL; - dynevent_arg_init(&arg, NULL, 0); + dynevent_arg_init(&arg, 0); arg.str = name; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) return ret; @@ -1546,9 +1546,9 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, if (n_fields > SYNTH_FIELDS_MAX) return -EINVAL; - dynevent_arg_init(&arg, NULL, 0); + dynevent_arg_init(&arg, 0); arg.str = name; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 307abb724a71..fe183d4045d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -962,9 +962,9 @@ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, if (ret) return ret; - dynevent_arg_init(&arg, NULL, 0); + dynevent_arg_init(&arg, 0); arg.str = loc; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) return ret; @@ -982,7 +982,7 @@ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, } arg.str = field; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) break; } @@ -1017,7 +1017,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; - dynevent_arg_init(&arg, NULL, 0); + dynevent_arg_init(&arg, 0); va_start(args, cmd); for (;;) { @@ -1033,7 +1033,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) } arg.str = field; - ret = dynevent_arg_add(cmd, &arg); + ret = dynevent_arg_add(cmd, &arg, NULL); if (ret) break; } -- cgit v1.2.3 From e9260f6257efa6a6293507696e875b6494ee3744 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 31 Jan 2020 15:55:33 -0600 Subject: tracing: Remove useless code in dynevent_arg_pair_add() The final addition to q is unnecessary, since q isn't ever used afterwards. Link: http://lkml.kernel.org/r/7880a1268217886cdba7035526650195668da856.1580506712.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index f9cfcdc9d1f3..204275ec8d71 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -322,7 +322,7 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, pr_err("field string is too long: %s\n", arg_pair->rhs); return -E2BIG; } - cmd->remaining -= delta; q += delta; + cmd->remaining -= delta; return ret; } -- cgit v1.2.3 From 2b90927c77c973771cc658d639724d5b247a83eb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 31 Jan 2020 15:55:34 -0600 Subject: tracing: Use seq_buf for building dynevent_cmd string The dynevent_cmd commands that build up the command string don't need to do that themselves - there's a seq_buf facility that does pretty much the same thing those command are doing manually, so use it instead. Link: http://lkml.kernel.org/r/eb8a6e835c964d0ab8a38cbf5ffa60746b54a465.1580506712.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 48 +++++++++++----------------------------- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- 3 files changed, 15 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 204275ec8d71..9f2e8520b748 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -247,8 +247,6 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, dynevent_check_arg_fn_t check_arg) { int ret = 0; - int delta; - char *q; if (check_arg) { ret = check_arg(arg); @@ -256,14 +254,11 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, return ret; } - q = cmd->buf + (cmd->maxlen - cmd->remaining); - - delta = snprintf(q, cmd->remaining, " %s%c", arg->str, arg->separator); - if (delta >= cmd->remaining) { - pr_err("String is too long: %s\n", arg->str); + ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator); + if (ret) { + pr_err("String is too long: %s%c\n", arg->str, arg->separator); return -E2BIG; } - cmd->remaining -= delta; return ret; } @@ -297,8 +292,6 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, dynevent_check_arg_fn_t check_arg) { int ret = 0; - int delta; - char *q; if (check_arg) { ret = check_arg(arg_pair); @@ -306,23 +299,15 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, return ret; } - q = cmd->buf + (cmd->maxlen - cmd->remaining); - - delta = snprintf(q, cmd->remaining, " %s%c", arg_pair->lhs, - arg_pair->operator); - if (delta >= cmd->remaining) { - pr_err("field string is too long: %s\n", arg_pair->lhs); - return -E2BIG; - } - cmd->remaining -= delta; q += delta; - - delta = snprintf(q, cmd->remaining, "%s%c", arg_pair->rhs, - arg_pair->separator); - if (delta >= cmd->remaining) { - pr_err("field string is too long: %s\n", arg_pair->rhs); + ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); + if (ret) { + pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); return -E2BIG; } - cmd->remaining -= delta; return ret; } @@ -340,17 +325,12 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, int dynevent_str_add(struct dynevent_cmd *cmd, const char *str) { int ret = 0; - int delta; - char *q; - - q = cmd->buf + (cmd->maxlen - cmd->remaining); - delta = snprintf(q, cmd->remaining, "%s", str); - if (delta >= cmd->remaining) { + ret = seq_buf_puts(&cmd->seq, str); + if (ret) { pr_err("String is too long: %s\n", str); return -E2BIG; } - cmd->remaining -= delta; return ret; } @@ -381,9 +361,7 @@ void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, { memset(cmd, '\0', sizeof(*cmd)); - cmd->buf = buf; - cmd->maxlen = maxlen; - cmd->remaining = cmd->maxlen; + seq_buf_init(&cmd->seq, buf, maxlen); cmd->type = type; cmd->run_command = run_command; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d2817fe52f32..b3bcfd8c7332 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1762,7 +1762,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) struct synth_event *se; int ret; - ret = trace_run_command(cmd->buf, create_or_delete_synth_event); + ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); if (ret) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fe183d4045d2..51efc790aea8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -903,7 +903,7 @@ static int create_or_delete_trace_kprobe(int argc, char **argv) static int trace_kprobe_run_command(struct dynevent_cmd *cmd) { - return trace_run_command(cmd->buf, create_or_delete_trace_kprobe); + return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe); } /** -- cgit v1.2.3 From 257af63d7f84f0672aa6a24b5511871f00741f19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 31 Jan 2020 16:03:14 -0800 Subject: bpf: Fix modifier skipping logic Fix the way modifiers are skipped while walking pointers. Otherwise second level dereferences of 'const struct foo *' will be rejected by the verifier. Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200201000314.261392-1-ast@kernel.org --- kernel/bpf/btf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8c9d8f266bef..805c43b083e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3931,6 +3931,7 @@ again: if (btf_type_is_ptr(mtype)) { const struct btf_type *stype; + u32 id; if (msize != size || off != moff) { bpf_log(log, @@ -3939,12 +3940,9 @@ again: return -EACCES; } - stype = btf_type_by_id(btf_vmlinux, mtype->type); - /* skip modifiers */ - while (btf_type_is_modifier(stype)) - stype = btf_type_by_id(btf_vmlinux, stype->type); + stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { - *next_btf_id = mtype->type; + *next_btf_id = id; return PTR_TO_BTF_ID; } } -- cgit v1.2.3 From 97a32539b9568bb653683349e5a76d02ff3c3e2c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:17 -0800 Subject: proc: convert everything to "struct proc_ops" The most notable change is DEFINE_SHOW_ATTRIBUTE macro split in seq_file.h. Conversion rule is: llseek => proc_lseek unlocked_ioctl => proc_ioctl xxx => proc_xxx delete ".owner = THIS_MODULE" line [akpm@linux-foundation.org: fix drivers/isdn/capi/kcapi_proc.c] [sfr@canb.auug.org.au: fix kernel/sched/psi.c] Link: http://lkml.kernel.org/r/20200122180545.36222f50@canb.auug.org.au Link: http://lkml.kernel.org/r/20191225172546.GB13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs.c | 9 ++++---- kernel/irq/proc.c | 42 ++++++++++++++++++------------------- kernel/kallsyms.c | 12 +++++------ kernel/latencytop.c | 14 ++++++------- kernel/locking/lockdep_proc.c | 15 +++++++------- kernel/module.c | 12 +++++------ kernel/profile.c | 24 +++++++++++----------- kernel/sched/psi.c | 48 +++++++++++++++++++++---------------------- 8 files changed, 87 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index c09ea4c995e1..a28c79c5f713 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -47,10 +47,9 @@ ikconfig_read_current(struct file *file, char __user *buf, &kernel_config_data); } -static const struct file_operations ikconfig_file_ops = { - .owner = THIS_MODULE, - .read = ikconfig_read_current, - .llseek = default_llseek, +static const struct proc_ops config_gz_proc_ops = { + .proc_read = ikconfig_read_current, + .proc_lseek = default_llseek, }; static int __init ikconfig_init(void) @@ -59,7 +58,7 @@ static int __init ikconfig_init(void) /* create the current config file */ entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, - &ikconfig_file_ops); + &config_gz_proc_ops); if (!entry) return -ENOMEM; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index cfc4f088a0e7..9e5783d98033 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -176,20 +176,20 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } -static const struct file_operations irq_affinity_proc_fops = { - .open = irq_affinity_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = irq_affinity_proc_write, +static const struct proc_ops irq_affinity_proc_ops = { + .proc_open = irq_affinity_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = irq_affinity_proc_write, }; -static const struct file_operations irq_affinity_list_proc_fops = { - .open = irq_affinity_list_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = irq_affinity_list_proc_write, +static const struct proc_ops irq_affinity_list_proc_ops = { + .proc_open = irq_affinity_list_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = irq_affinity_list_proc_write, }; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK @@ -246,12 +246,12 @@ static int default_affinity_open(struct inode *inode, struct file *file) return single_open(file, default_affinity_show, PDE_DATA(inode)); } -static const struct file_operations default_affinity_proc_fops = { - .open = default_affinity_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = default_affinity_write, +static const struct proc_ops default_affinity_proc_ops = { + .proc_open = default_affinity_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = default_affinity_write, }; static int irq_node_proc_show(struct seq_file *m, void *v) @@ -342,7 +342,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ proc_create_data("smp_affinity", 0644, desc->dir, - &irq_affinity_proc_fops, irqp); + &irq_affinity_proc_ops, irqp); /* create /proc/irq//affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, @@ -350,7 +350,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq//smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, - &irq_affinity_list_proc_fops, irqp); + &irq_affinity_list_proc_ops, irqp); proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); @@ -401,7 +401,7 @@ static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP proc_create("irq/default_smp_affinity", 0644, NULL, - &default_affinity_proc_fops); + &default_affinity_proc_ops); #endif } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 136ce049c4ad..d812b90f4c86 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -698,16 +698,16 @@ const char *kdb_walk_kallsyms(loff_t *pos) } #endif /* CONFIG_KGDB_KDB */ -static const struct file_operations kallsyms_operations = { - .open = kallsyms_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, +static const struct proc_ops kallsyms_proc_ops = { + .proc_open = kallsyms_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release_private, }; static int __init kallsyms_init(void) { - proc_create("kallsyms", 0444, NULL, &kallsyms_operations); + proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops); return 0; } device_initcall(kallsyms_init); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index e3acead004e6..8d1c15832e55 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -255,17 +255,17 @@ static int lstats_open(struct inode *inode, struct file *filp) return single_open(filp, lstats_show, NULL); } -static const struct file_operations lstats_fops = { - .open = lstats_open, - .read = seq_read, - .write = lstats_write, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops lstats_proc_ops = { + .proc_open = lstats_open, + .proc_read = seq_read, + .proc_write = lstats_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init init_lstats_procfs(void) { - proc_create("latency_stats", 0644, NULL, &lstats_fops); + proc_create("latency_stats", 0644, NULL, &lstats_proc_ops); return 0; } diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 9bb6d2497b04..231684cfc5ae 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -643,12 +643,12 @@ static int lock_stat_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations proc_lock_stat_operations = { - .open = lock_stat_open, - .write = lock_stat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = lock_stat_release, +static const struct proc_ops lock_stat_proc_ops = { + .proc_open = lock_stat_open, + .proc_write = lock_stat_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lock_stat_release, }; #endif /* CONFIG_LOCK_STAT */ @@ -660,8 +660,7 @@ static int __init lockdep_proc_init(void) #endif proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, - &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops); #endif return 0; diff --git a/kernel/module.c b/kernel/module.c index 90ec5ab60255..33569a01d6e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4354,16 +4354,16 @@ static int modules_open(struct inode *inode, struct file *file) return err; } -static const struct file_operations proc_modules_operations = { - .open = modules_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops modules_proc_ops = { + .proc_open = modules_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init proc_modules_init(void) { - proc_create("modules", 0, NULL, &proc_modules_operations); + proc_create("modules", 0, NULL, &modules_proc_ops); return 0; } module_init(proc_modules_init); diff --git a/kernel/profile.c b/kernel/profile.c index 4b144b02ca5d..6f69a4195d56 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,18 +442,18 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file, return err; } -static const struct file_operations prof_cpu_mask_proc_fops = { - .open = prof_cpu_mask_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = prof_cpu_mask_proc_write, +static const struct proc_ops prof_cpu_mask_proc_ops = { + .proc_open = prof_cpu_mask_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = prof_cpu_mask_proc_write, }; void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ - proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops); + proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops); } /* @@ -517,10 +517,10 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return count; } -static const struct file_operations proc_profile_operations = { - .read = read_profile, - .write = write_profile, - .llseek = default_llseek, +static const struct proc_ops profile_proc_ops = { + .proc_read = read_profile, + .proc_write = write_profile, + .proc_lseek = default_llseek, }; int __ref create_proc_profile(void) @@ -548,7 +548,7 @@ int __ref create_proc_profile(void) err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, - NULL, &proc_profile_operations); + NULL, &profile_proc_ops); if (!entry) goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db7b50bba3f1..ac4bd0ca11cc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1251,40 +1251,40 @@ static int psi_fop_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static const struct file_operations psi_io_fops = { - .open = psi_io_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_io_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_io_proc_ops = { + .proc_open = psi_io_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_io_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_memory_fops = { - .open = psi_memory_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_memory_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_memory_proc_ops = { + .proc_open = psi_memory_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_memory_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_cpu_fops = { - .open = psi_cpu_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_cpu_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_cpu_proc_ops = { + .proc_open = psi_cpu_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_cpu_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; static int __init psi_proc_init(void) { if (psi_enable) { proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); } return 0; } -- cgit v1.2.3 From cde26a6e17ec36f6f20102a7e5767c2a0096c95f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 14:09:22 +0900 Subject: kallsyms: fix type of kallsyms_token_table[] kallsyms_token_table[] only contains ASCII characters. It should be char instead of u8. Signed-off-by: Masahiro Yamada Reviewed-by: Masami Hiramatsu --- kernel/kallsyms.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 136ce049c4ad..53f84f685841 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -44,7 +44,7 @@ __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __weak; +extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; extern const unsigned int kallsyms_markers[] __weak; @@ -58,7 +58,8 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; - const u8 *tptr, *data; + const char *tptr; + const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; -- cgit v1.2.3 From 91ef26f914171cf753330f13724fd9142b5b1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: dma-direct: relax addressability checks in dma_direct_supported dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even region), the fact the ZONE_DMA isn't even guaranteed to be the lowest addresses and failure of having proper interfaces to the MM code this fails at least for one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guranteed to always work by the API contract (using bounce buffers if needed), we can short cut the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..32ec69cdba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -472,28 +472,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } -- cgit v1.2.3 From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 32ec69cdba54..594bddd04e01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,13 +357,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,8 +364,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- cgit v1.2.3 From 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: dma-direct: improve DMA mask overflow reporting Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditonally and with a slightly improved message. Remove the now pointless report_addr helper. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 594bddd04e01..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -371,7 +359,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -409,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From 24a9729f831462b1d9d61dc85ecc91c59037243f Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Sat, 1 Feb 2020 12:57:04 +0530 Subject: tracing: Annotate ftrace_graph_hash pointer with __rcu Fix following instances of sparse error kernel/trace/ftrace.c:5664:29: error: incompatible types in comparison kernel/trace/ftrace.c:5785:21: error: incompatible types in comparison kernel/trace/ftrace.c:5864:36: error: incompatible types in comparison kernel/trace/ftrace.c:5866:25: error: incompatible types in comparison Use rcu_dereference_protected to access the __rcu annotated pointer. Link: http://lkml.kernel.org/r/20200201072703.17330-1-frextrite@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Amol Grover Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0e9612c30995..01d2ecd66161 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5591,7 +5591,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH; struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; enum graph_filter_type { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5480a2aa334..18ceab59a5ba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -964,22 +964,25 @@ extern void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc); #ifdef CONFIG_DYNAMIC_FTRACE -extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash __rcu *ftrace_graph_hash; extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { unsigned long addr = trace->func; int ret = 0; + struct ftrace_hash *hash; preempt_disable_notrace(); - if (ftrace_hash_empty(ftrace_graph_hash)) { + hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); + + if (ftrace_hash_empty(hash)) { ret = 1; goto out; } - if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + if (ftrace_lookup_ip(hash, addr)) { /* * This needs to be cleared on the return functions -- cgit v1.2.3 From fd0e6852c407dd9aefc594f54ddcc21d84803d3b Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Wed, 5 Feb 2020 11:27:02 +0530 Subject: tracing: Annotate ftrace_graph_notrace_hash pointer with __rcu Fix following instances of sparse error kernel/trace/ftrace.c:5667:29: error: incompatible types in comparison kernel/trace/ftrace.c:5813:21: error: incompatible types in comparison kernel/trace/ftrace.c:5868:36: error: incompatible types in comparison kernel/trace/ftrace.c:5870:25: error: incompatible types in comparison Use rcu_dereference_protected to dereference the newly annotated pointer. Link: http://lkml.kernel.org/r/20200205055701.30195-1-frextrite@gmail.com Signed-off-by: Amol Grover Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.h | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 01d2ecd66161..481ede3eac13 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5592,7 +5592,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH; -struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; +struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH; enum graph_filter_type { GRAPH_FILTER_NOTRACE = 0, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 18ceab59a5ba..022def96d307 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -965,7 +965,7 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; -extern struct ftrace_hash *ftrace_graph_notrace_hash; +extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { @@ -1018,10 +1018,14 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; + struct ftrace_hash *notrace_hash; preempt_disable_notrace(); - if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + !preemptible()); + + if (ftrace_lookup_ip(notrace_hash, addr)) ret = 1; preempt_enable_notrace(); -- cgit v1.2.3 From 16052dd5bdfa16dbe18d8c1d4cde2ddab9d23177 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Feb 2020 02:17:57 -0500 Subject: ftrace: Add comment to why rcu_dereference_sched() is open coded Because the function graph tracer can execute in sections where RCU is not "watching", the rcu_dereference_sched() for the has needs to be open coded. This is fine because the RCU "flavor" of the ftrace hash is protected by its own RCU handling (it does its own little synchronization on every CPU and does not rely on RCU sched). Acked-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 022def96d307..8c52f5de9384 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -975,6 +975,11 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) preempt_disable_notrace(); + /* + * Have to open code "rcu_dereference_sched()" because the + * function graph tracer can be called when RCU is not + * "watching". + */ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); if (ftrace_hash_empty(hash)) { @@ -1022,6 +1027,11 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) preempt_disable_notrace(); + /* + * Have to open code "rcu_dereference_sched()" because the + * function graph tracer can be called when RCU is not + * "watching". + */ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, !preemptible()); -- cgit v1.2.3 From 54a16ff6f2e50775145b210bcd94d62c3c2af117 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Feb 2020 09:20:32 -0500 Subject: ftrace: Protect ftrace_graph_hash with ftrace_sync As function_graph tracer can run when RCU is not "watching", it can not be protected by synchronize_rcu() it requires running a task on each CPU before it can be freed. Calling schedule_on_each_cpu(ftrace_sync) needs to be used. Link: https://lore.kernel.org/r/20200205131110.GT2935@paulmck-ThinkPad-P72 Cc: stable@vger.kernel.org Fixes: b9b0c831bed26 ("ftrace: Convert graph filter to use hash tables") Reported-by: "Paul E. McKenney" Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 11 +++++++++-- kernel/trace/trace.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 481ede3eac13..3f7ee102868a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5867,8 +5867,15 @@ ftrace_graph_release(struct inode *inode, struct file *file) mutex_unlock(&graph_lock); - /* Wait till all users are no longer using the old hash */ - synchronize_rcu(); + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + schedule_on_each_cpu(ftrace_sync); free_ftrace_hash(old_hash); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8c52f5de9384..3c75d29bd861 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -979,6 +979,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". + * Protected with schedule_on_each_cpu(ftrace_sync) */ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); @@ -1031,6 +1032,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". + * Protected with schedule_on_each_cpu(ftrace_sync) */ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, !preemptible()); -- cgit v1.2.3 From fcf2736c82ca1908e3a0e74730c404baebd8ccdf Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 6 Feb 2020 11:40:09 +0000 Subject: Revert "kdb: Get rid of confusing diag msg from "rd" if current task has no regs" This reverts commit bbfceba15f8d1260c328a254efc2b3f2deae4904. When DBG_MAX_REG_NUM is zero then a number of symbols are conditionally defined. It is therefore not possible to check it using C expressions. Reported-by: Anatoly Pugachev Acked-by: Doug Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b22292b649c4..ba12e9f4661e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -543,8 +543,9 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, if (diag) return diag; } else if (symname[0] == '%') { - if (kdb_check_regs()) - return 0; + diag = kdb_check_regs(); + if (diag) + return diag; /* Implement register values with % at a later time as it is * arch optional. */ @@ -1835,7 +1836,8 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int len = 0; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 int i; char *rname; int rsize; @@ -1844,14 +1846,8 @@ static int kdb_rd(int argc, const char **argv) u16 reg16; u8 reg8; - if (kdb_check_regs()) - return 0; - - /* Fallback to Linux showregs() if we don't have DBG_MAX_REG_NUM */ - if (DBG_MAX_REG_NUM <= 0) { - kdb_dumpregs(kdb_current_regs); - return 0; - } + if (len) + return len; for (i = 0; i < DBG_MAX_REG_NUM; i++) { rsize = dbg_reg_def[i].size * 2; @@ -1893,7 +1889,12 @@ static int kdb_rd(int argc, const char **argv) } } kdb_printf("\n"); +#else + if (len) + return len; + kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1927,8 +1928,9 @@ static int kdb_rm(int argc, const char **argv) if (diag) return diag; - if (kdb_check_regs()) - return 0; + diag = kdb_check_regs(); + if (diag) + return diag; diag = KDB_BADREG; for (i = 0; i < DBG_MAX_REG_NUM; i++) { -- cgit v1.2.3 From 1e474b28e78897d0d170fab3b28ba683149cb9ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Feb 2020 06:34:09 -0800 Subject: smp/up: Make smp_call_function_single() match SMP semantics In CONFIG_SMP=y kernels, smp_call_function_single() returns -ENXIO when invoked for a non-existent CPU. In contrast, in CONFIG_SMP=n kernels, a splat is emitted and smp_call_function_single() otherwise silently ignores its "cpu" argument, instead pretending that the caller intended to have something happen on CPU 0. Given that there is now code that expects smp_call_function_single() to return an error if a bad CPU was specified, this difference in semantics needs to be addressed. Bring the semantics of the CONFIG_SMP=n version of smp_call_function_single() into alignment with its CONFIG_SMP=y counterpart. Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200205143409.GA7021@paulmck-ThinkPad-P72 --- kernel/up.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index 53144d056252..c6f323dcd45b 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -14,7 +14,8 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { unsigned long flags; - WARN_ON(cpu != 0); + if (cpu != 0) + return -ENXIO; local_irq_save(flags); func(info); -- cgit v1.2.3 From fbc2d1686dc5c2e403091f3c25ca2bc16f88cccb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Dec 2019 21:06:08 -0500 Subject: get rid of cg_invalf() pointless alias for invalf()... Signed-off-by: Al Viro --- kernel/cgroup/cgroup-v1.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 09f3a413f6f8..77eb72b704a6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -18,8 +18,6 @@ #include -#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) - /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -924,7 +922,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->subsys_mask |= (1 << i); return 0; } - return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + return invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; @@ -952,7 +950,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) - return cg_invalf(fc, "cgroup1: release_agent respecified"); + return invalf(fc, "cgroup1: release_agent respecified"); ctx->release_agent = param->string; param->string = NULL; break; @@ -962,9 +960,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) return -ENOENT; /* Can't specify an empty name */ if (!param->size) - return cg_invalf(fc, "cgroup1: Empty name"); + return invalf(fc, "cgroup1: Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) - return cg_invalf(fc, "cgroup1: Name too long"); + return invalf(fc, "cgroup1: Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; @@ -972,11 +970,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; - return cg_invalf(fc, "cgroup1: Invalid name"); + return invalf(fc, "cgroup1: Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) - return cg_invalf(fc, "cgroup1: name respecified"); + return invalf(fc, "cgroup1: name respecified"); ctx->name = param->string; param->string = NULL; break; @@ -1011,7 +1009,7 @@ static int check_cgroupfs_options(struct fs_context *fc) if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) - return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + return invalf(fc, "cgroup1: subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } @@ -1021,7 +1019,7 @@ static int check_cgroupfs_options(struct fs_context *fc) * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) - return cg_invalf(fc, "cgroup1: Need name or subsystem set"); + return invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility @@ -1029,11 +1027,11 @@ static int check_cgroupfs_options(struct fs_context *fc) * the cpuset subsystem. */ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) - return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); + return invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) - return cg_invalf(fc, "cgroup1: none used incorrectly"); + return invalf(fc, "cgroup1: none used incorrectly"); return 0; } @@ -1063,7 +1061,7 @@ int cgroup1_reconfigure(struct fs_context *fc) /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { - cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + errorf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; @@ -1180,7 +1178,7 @@ static int cgroup1_root_to_use(struct fs_context *fc) * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) - return cg_invalf(fc, "cgroup1: No subsys list or none specified"); + return invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) -- cgit v1.2.3 From 96cafb9ccb153f6a82ff2c9bde68916d9d65501e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 6 Dec 2019 10:45:01 -0600 Subject: fs_parser: remove fs_parameter_description name field Unused now. Signed-off-by: Eric Sandeen Acked-by: David Howells Signed-off-by: Al Viro --- kernel/bpf/inode.c | 1 - kernel/cgroup/cgroup-v1.c | 1 - kernel/cgroup/cgroup.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index ecf42bec38c0..9608aa48128d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -593,7 +593,6 @@ static const struct fs_parameter_spec bpf_param_specs[] = { }; static const struct fs_parameter_description bpf_fs_parameters = { - .name = "bpf", .specs = bpf_param_specs, }; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 77eb72b704a6..c7b526f33621 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -898,7 +898,6 @@ static const struct fs_parameter_spec cgroup1_param_specs[] = { }; const struct fs_parameter_description cgroup1_fs_parameters = { - .name = "cgroup1", .specs = cgroup1_param_specs, }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..d86d441d93ca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1823,7 +1823,6 @@ static const struct fs_parameter_spec cgroup2_param_specs[] = { }; static const struct fs_parameter_description cgroup2_fs_parameters = { - .name = "cgroup2", .specs = cgroup2_param_specs, }; -- cgit v1.2.3 From d7167b149943e38ad610191ecbb0800c78bbced9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 7 Sep 2019 07:23:15 -0400 Subject: fs_parse: fold fs_parameter_desc/fs_parameter_spec The former contains nothing but a pointer to an array of the latter... Signed-off-by: Al Viro --- kernel/bpf/inode.c | 10 +++------- kernel/cgroup/cgroup-internal.h | 4 ++-- kernel/cgroup/cgroup-v1.c | 8 ++------ kernel/cgroup/cgroup.c | 12 ++++-------- 4 files changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9608aa48128d..f4b2ef72e265 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -587,15 +587,11 @@ enum { OPT_MODE, }; -static const struct fs_parameter_spec bpf_param_specs[] = { +static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32oct ("mode", OPT_MODE), {} }; -static const struct fs_parameter_description bpf_fs_parameters = { - .specs = bpf_param_specs, -}; - struct bpf_mount_opts { umode_t mode; }; @@ -606,7 +602,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fs_parse_result result; int opt; - opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd @@ -682,7 +678,7 @@ static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, - .parameters = &bpf_fs_parameters, + .parameters = bpf_fs_parameters, .kill_sb = kill_litter_super, }; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 90d1710fef6c..bfbeabc17a9d 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; @@ -265,7 +265,7 @@ extern const struct proc_ns_operations cgroupns_operations; */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; -extern const struct fs_parameter_description cgroup1_fs_parameters; +extern const struct fs_parameter_spec cgroup1_fs_parameters[]; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c7b526f33621..408545620dad 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -885,7 +885,7 @@ enum cgroup1_param { Opt_xattr, }; -static const struct fs_parameter_spec cgroup1_param_specs[] = { +const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), @@ -897,10 +897,6 @@ static const struct fs_parameter_spec cgroup1_param_specs[] = { {} }; -const struct fs_parameter_description cgroup1_fs_parameters = { - .specs = cgroup1_param_specs, -}; - int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); @@ -908,7 +904,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fs_parse_result result; int opt, i; - opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { if (strcmp(param->key, "source") == 0) { fc->source = param->string; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d86d441d93ca..a70a37e85d11 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1816,23 +1816,19 @@ enum cgroup2_param { nr__cgroup2_params }; -static const struct fs_parameter_spec cgroup2_param_specs[] = { +static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("memory_localevents", Opt_memory_localevents), {} }; -static const struct fs_parameter_description cgroup2_fs_parameters = { - .specs = cgroup2_param_specs, -}; - static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; - opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result); + opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; @@ -2155,7 +2151,7 @@ static void cgroup_kill_sb(struct super_block *sb) struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, - .parameters = &cgroup1_fs_parameters, + .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; @@ -2163,7 +2159,7 @@ struct file_system_type cgroup_fs_type = { static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, - .parameters = &cgroup2_fs_parameters, + .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -- cgit v1.2.3 From 58c025f0e803a45453bb5ada957cbf98163d3048 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Dec 2019 21:35:27 -0500 Subject: cgroup1: switch to use of errorfc() et.al. Signed-off-by: Al Viro --- kernel/cgroup/cgroup-v1.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 408545620dad..be1a1c83cdd1 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -917,7 +917,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->subsys_mask |= (1 << i); return 0; } - return invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; @@ -945,7 +945,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) - return invalf(fc, "cgroup1: release_agent respecified"); + return invalfc(fc, "release_agent respecified"); ctx->release_agent = param->string; param->string = NULL; break; @@ -955,9 +955,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) return -ENOENT; /* Can't specify an empty name */ if (!param->size) - return invalf(fc, "cgroup1: Empty name"); + return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) - return invalf(fc, "cgroup1: Name too long"); + return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; @@ -965,11 +965,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; - return invalf(fc, "cgroup1: Invalid name"); + return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) - return invalf(fc, "cgroup1: name respecified"); + return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; @@ -1004,7 +1004,7 @@ static int check_cgroupfs_options(struct fs_context *fc) if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) - return invalf(fc, "cgroup1: subsys name conflicts with all"); + return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } @@ -1014,7 +1014,7 @@ static int check_cgroupfs_options(struct fs_context *fc) * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) - return invalf(fc, "cgroup1: Need name or subsystem set"); + return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility @@ -1022,11 +1022,11 @@ static int check_cgroupfs_options(struct fs_context *fc) * the cpuset subsystem. */ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) - return invalf(fc, "cgroup1: noprefix used incorrectly"); + return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) - return invalf(fc, "cgroup1: none used incorrectly"); + return invalfc(fc, "none used incorrectly"); return 0; } @@ -1056,7 +1056,7 @@ int cgroup1_reconfigure(struct fs_context *fc) /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { - errorf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; @@ -1173,7 +1173,7 @@ static int cgroup1_root_to_use(struct fs_context *fc) * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) - return invalf(fc, "cgroup1: No subsys list or none specified"); + return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) -- cgit v1.2.3 From f9f21cea311340f38074ff93a8d89b4a9cae6bcc Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 6 Feb 2020 11:15:21 -0800 Subject: genirq: Clarify that irq wake state is orthogonal to enable/disable There's some confusion around if an irq that's disabled with disable_irq() can still wake the system from sleep states such as "suspend to RAM". Clarify this in the kernel documentation for irq_set_irq_wake() so that it's clear that an irq can be disabled and still wake the system if it has been marked for wakeup. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lkml.kernel.org/r/20200206191521.94559-1-swboyd@chromium.org --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 818b2802d3e7..3089a60ea8f9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -731,6 +731,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". + * + * Note: irq enable/disable state is completely orthogonal + * to the enable/disable state of irq wake. An irq can be + * disabled with disable_irq() and still wake the system as + * long as the irq has wake enabled. If this does not hold, + * then the underlying irq chip and the related driver need + * to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { -- cgit v1.2.3 From 52262ee567ad14c9606be25f3caddcefa3c514e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 28 Jan 2020 15:40:06 +0000 Subject: sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression The following XFS commit: 8ab39f11d974 ("xfs: prevent CIL push holdoff in log recovery") changed the logic from using bound workqueues to using unbound workqueues. Functionally this makes sense but it was observed at the time that the dbench performance dropped quite a lot and CPU migrations were increased. The current pattern of the task migration is straight-forward. With XFS, an IO issuer delegates work to xlog_cil_push_work ()on an unbound kworker. This runs on a nearby CPU and on completion, dbench wakes up on its old CPU as it is still idle and no migration occurs. dbench then queues the real IO on the blk_mq_requeue_work() work item which runs on a bound kworker which is forced to run on the same CPU as dbench. When IO completes, the bound kworker wakes dbench but as the kworker is a bound but, real task, the CPU is not considered idle and dbench gets migrated by select_idle_sibling() to a new CPU. dbench may ping-pong between two CPUs for a while but ultimately it starts a round-robin of all CPUs sharing the same LLC. High-frequency migration on each IO completion has poor performance overall. It has negative implications both in commication costs and power management. mpstat confirmed that at low thread counts that all CPUs sharing an LLC has low level of activity. Note that even if the CIL patch was reverted, there still would be migrations but the impact is less noticeable. It turns out that individually the scheduler, XFS, blk-mq and workqueues all made sensible decisions but in combination, the overall effect was sub-optimal. This patch special cases the IO issue/completion pattern and allows a bound kworker waker and a task wakee to stack on the same CPU if there is a strong chance they are directly related. The expectation is that the kworker is likely going back to sleep shortly. This is not guaranteed as the IO could be queued asynchronously but there is a very strong relationship between the task and kworker in this case that would justify stacking on the same CPU instead of migrating. There should be few concerns about kworker starvation given that the special casing is only when the kworker is the waker. DBench on XFS MMTests config: io-dbench4-async modified to run on a fresh XFS filesystem UMA machine with 8 cores sharing LLC 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack Amean 1 22.63 ( 0.00%) 20.54 * 9.23%* Amean 2 25.56 ( 0.00%) 23.40 * 8.44%* Amean 4 28.63 ( 0.00%) 27.85 * 2.70%* Amean 8 37.66 ( 0.00%) 37.68 ( -0.05%) Amean 64 469.47 ( 0.00%) 468.26 ( 0.26%) Stddev 1 1.00 ( 0.00%) 0.72 ( 28.12%) Stddev 2 1.62 ( 0.00%) 1.97 ( -21.54%) Stddev 4 2.53 ( 0.00%) 3.58 ( -41.19%) Stddev 8 5.30 ( 0.00%) 5.20 ( 1.92%) Stddev 64 86.36 ( 0.00%) 94.53 ( -9.46%) NUMA machine, 48 CPUs total, 24 CPUs share cache 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Amean 1 58.69 ( 0.00%) 30.21 * 48.53%* Amean 2 60.90 ( 0.00%) 35.29 * 42.05%* Amean 4 66.77 ( 0.00%) 46.55 * 30.28%* Amean 8 81.41 ( 0.00%) 68.46 * 15.91%* Amean 16 113.29 ( 0.00%) 107.79 * 4.85%* Amean 32 199.10 ( 0.00%) 198.22 * 0.44%* Amean 64 478.99 ( 0.00%) 477.06 * 0.40%* Amean 128 1345.26 ( 0.00%) 1372.64 * -2.04%* Stddev 1 2.64 ( 0.00%) 4.17 ( -58.08%) Stddev 2 4.35 ( 0.00%) 5.38 ( -23.73%) Stddev 4 6.77 ( 0.00%) 6.56 ( 3.00%) Stddev 8 11.61 ( 0.00%) 10.91 ( 6.04%) Stddev 16 18.63 ( 0.00%) 19.19 ( -3.01%) Stddev 32 38.71 ( 0.00%) 38.30 ( 1.06%) Stddev 64 100.28 ( 0.00%) 91.24 ( 9.02%) Stddev 128 186.87 ( 0.00%) 160.34 ( 14.20%) Dbench has been modified to report the time to complete a single "load file". This is a more meaningful metric for dbench that a throughput metric as the benchmark makes many different system calls that are not throughput-related Patch shows a 9.23% and 48.53% reduction in the time to process a load file with the difference partially explained by the number of CPUs sharing a LLC. In a separate run, task migrations were almost eliminated by the patch for low client counts. In case people have issue with the metric used for the benchmark, this is a comparison of the throughputs as reported by dbench on the NUMA machine. dbench4 Throughput (misleading but traditional) 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Hmean 1 321.41 ( 0.00%) 617.82 * 92.22%* Hmean 2 622.87 ( 0.00%) 1066.80 * 71.27%* Hmean 4 1134.56 ( 0.00%) 1623.74 * 43.12%* Hmean 8 1869.96 ( 0.00%) 2212.67 * 18.33%* Hmean 16 2673.11 ( 0.00%) 2806.13 * 4.98%* Hmean 32 3032.74 ( 0.00%) 3039.54 ( 0.22%) Hmean 64 2514.25 ( 0.00%) 2498.96 * -0.61%* Hmean 128 1778.49 ( 0.00%) 1746.05 * -1.82%* Note that this is somewhat specific to XFS and ext4 shows no performance difference as it does not rely on kworkers in the same way. No major problem was observed running other workloads on different machines although not all tests have completed yet. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200128154006.GD3466@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ----------- kernel/sched/fair.c | 14 ++++++++++++++ kernel/sched/sched.h | 13 +++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e54f3ed571..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1447,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25dffc03f0f6..94c3b8469cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5912,6 +5912,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + return prev; + } + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..5876e6ba5903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif -- cgit v1.2.3 From 10f129cb59cf6b4aee419ce4b1325fc532d975fb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 5 Feb 2020 16:34:04 -0600 Subject: tracing/kprobe: Fix uninitialized variable bug There is a potential execution path in which variable *ret* is returned without being properly initialized, previously. Fix this by initializing variable *ret* to 0. Link: http://lkml.kernel.org/r/20200205223404.GA3379@embeddedor Addresses-Coverity-ID: 1491142 ("Uninitialized scalar variable") Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions") Reviewed-by: Tom Zanussi Acked-by: Masami Hiramatsu Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 51efc790aea8..21bafd48f2ac 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1012,7 +1012,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) { struct dynevent_arg arg; va_list args; - int ret; + int ret = 0; if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; -- cgit v1.2.3 From 2bf0eb9b3b0d099b20b2c4736436b666d78b94d5 Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Mon, 10 Feb 2020 09:14:41 +0800 Subject: bpf: Make btf_check_func_type_match() static Fix the following sparse warning: kernel/bpf/btf.c:4131:5: warning: symbol 'btf_check_func_type_match' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200210011441.147102-1-yaohongbo@huawei.com --- kernel/bpf/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; -- cgit v1.2.3 From d090409abbdd1fcbdfd6ed66612390ba8c814749 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:48 -0600 Subject: tracing: Add missing nest end to synth_event_trace_start() error case If the ring_buffer reserve in synth_event_trace_start() fails, the matching ring_buffer_nest_end() should be called in the error code, since nothing else will ever call it in this case. Link: http://lkml.kernel.org/r/20abc444b3eeff76425f895815380abe7aa53ff8.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b3bcfd8c7332..a546ffa14785 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2043,6 +2043,7 @@ int synth_event_trace_start(struct trace_event_file *file, entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, sizeof(*entry) + fields_size); if (!entry) { + ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; goto out; } -- cgit v1.2.3 From 0c62f6cd9ed320cb0ca39e33addf3a3da51b7328 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:49 -0600 Subject: tracing: Don't return -EINVAL when tracing soft disabled synth events There's no reason to return -EINVAL when tracing a synthetic event if it's soft disabled - treat it the same as if it were hard disabled and return normally. Have synth_event_trace() and synth_event_trace_array() just return normally, and have synth_event_trace_start set the trace state to disabled and return. Link: http://lkml.kernel.org/r/df5d02a1625aff97c9866506c5bada6a069982ba.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a546ffa14785..99a02168599b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1828,7 +1828,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1836,9 +1837,6 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -1918,7 +1916,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1926,9 +1925,6 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -2017,7 +2013,8 @@ int synth_event_trace_start(struct trace_event_file *file, * trace case, we save the enabed state upon start and just * ignore the following data calls. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { trace_state->enabled = false; goto out; } @@ -2026,11 +2023,6 @@ int synth_event_trace_start(struct trace_event_file *file, trace_state->event = file->event_call->data; - if (trace_trigger_soft_disabled(file)) { - ret = -EINVAL; - goto out; - } - fields_size = trace_state->event->n_u64 * sizeof(u64); /* -- cgit v1.2.3 From 7276531d4036f5db2af15c8b6caa02e7741f5d80 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:50 -0600 Subject: tracing: Consolidate trace() functions Move the checking, buffer reserve and buffer commit code in synth_event_trace_start/end() into inline functions __synth_event_trace_start/end() so they can also be used by synth_event_trace() and synth_event_trace_array(), and then have all those functions use them. Also, change synth_event_trace_state.enabled to disabled so it only needs to be set if the event is disabled, which is not normally the case. Link: http://lkml.kernel.org/r/b1f3108d0f450e58192955a300e31d0405ab4149.1581374549.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 220 +++++++++++++++------------------------ 1 file changed, 86 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 99a02168599b..65b54d6a1422 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1791,6 +1791,60 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) } EXPORT_SYMBOL_GPL(synth_event_cmd_init); +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + /** * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event @@ -1812,69 +1866,38 @@ EXPORT_SYMBOL_GPL(synth_event_cmd_init); */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; va_list args; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; val = va_arg(args, u64); - if (event->fields[i]->is_string) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = val; + state.entry->fields[n_u64] = val; n_u64++; } } va_end(args); - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1901,62 +1924,31 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - if (event->fields[i]->is_string) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = vals[i]; + state.entry->fields[n_u64] = vals[i]; n_u64++; } } - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1993,55 +1985,17 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry; - int fields_size = 0; - int ret = 0; + int ret; - if (!trace_state) { - ret = -EINVAL; - goto out; - } + if (!trace_state) + return -EINVAL; memset(trace_state, '\0', sizeof(*trace_state)); - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->enabled = false; - goto out; - } - - trace_state->enabled = true; + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - goto out; - } - - trace_state->entry = entry; -out: return ret; } EXPORT_SYMBOL_GPL(synth_event_trace_start); @@ -2074,7 +2028,7 @@ static int __synth_event_add_val(const char *field_name, u64 val, trace_state->add_next = true; } - if (!trace_state->enabled) + if (trace_state->disabled) goto out; event = trace_state->event; @@ -2209,9 +2163,7 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state) if (!trace_state) return -EINVAL; - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); + __synth_event_trace_end(trace_state); return 0; } -- cgit v1.2.3 From e3728b50cd9be7d4b1469447cdf1feb93e3b7adb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 10:11:02 +0100 Subject: ACPI: PM: s2idle: Avoid possible race related to the EC GPE It is theoretically possible for the ACPI EC GPE to be set after the s2idle_ops->wake() called from s2idle_loop() has returned and before the subsequent pm_wakeup_pending() check is carried out. If that happens, the resulting wakeup event will cause the system to resume even though it may be a spurious one. To avoid that race, first make the ->wake() callback in struct platform_s2idle_ops return a bool value indicating whether or not to let the system resume and rearrange s2idle_loop() to use that value instad of the direct pm_wakeup_pending() call if ->wake() is present. Next, rework acpi_s2idle_wake() to process EC events and check pm_wakeup_pending() before re-arming the SCI for system wakeup to prevent it from triggering prematurely and add comments to that function to explain the rationale for the new code flow. Fixes: 56b991849009 ("PM: sleep: Simplify suspend-to-idle control flow") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2c47280fbfc7..8b1bb5ee7e5d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -131,11 +131,12 @@ static void s2idle_loop(void) * to avoid them upfront. */ for (;;) { - if (s2idle_ops && s2idle_ops->wake) - s2idle_ops->wake(); - - if (pm_wakeup_pending()) + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { break; + } pm_wakeup_clear(false); -- cgit v1.2.3 From 6fcca0fa48118e6d63733eb4644c6cd880c15b8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 3 Feb 2020 13:22:16 -0800 Subject: sched/psi: Fix OOB write when writing 0 bytes to PSI files Issuing write() with count parameter set to 0 on any file under /proc/pressure/ will cause an OOB write because of the access to buf[buf_size-1] when NUL-termination is performed. Fix this by checking for buf_size to be non-zero. Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db7b50bba3f1..38ccd49b9bf6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; + if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; -- cgit v1.2.3 From 4104a562e0ca62e971089db9d3c47794a0d7d4eb Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 1 Feb 2020 18:28:03 +0530 Subject: sched/core: Annotate curr pointer in rq with __rcu This patch fixes the following sparse warnings in sched/core.c and sched/membarrier.c: kernel/sched/core.c:2372:27: error: incompatible types in comparison expression kernel/sched/core.c:4061:17: error: incompatible types in comparison expression kernel/sched/core.c:6067:9: error: incompatible types in comparison expression kernel/sched/membarrier.c:108:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:177:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:243:21: error: incompatible types in comparison expression Signed-off-by: Madhuparna Bhowmik Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200201125803.20245-1-madhuparnabhowmik10@gmail.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5876e6ba5903..9ea647835fd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -896,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; -- cgit v1.2.3 From e9f5490c3574b435ce7fe7a71724aa3866babc7f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Feb 2020 19:29:12 -0800 Subject: sched/fair: Fix kernel-doc warning in attach_entity_load_avg() Fix kernel-doc warning in kernel/sched/fair.c, caused by a recent function parameter removal: ../kernel/sched/fair.c:3526: warning: Excess function parameter 'flags' description in 'attach_entity_load_avg' Fixes: a4f9a0e51bbf ("sched/fair: Remove redundant call to cpufreq_update_util()") Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/cbe964e4-6879-fd08-41c9-ef1917414af4@infradead.org --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94c3b8469cf6..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. -- cgit v1.2.3 From db8dd9697238be70a6b4f9d0284cd89f59c0e070 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:49 +0300 Subject: cgroup-v1: cgroup_pidlist_next should update position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. # mount | grep cgroup # dd if=/mnt/cgroup.procs bs=1 # normal output ... 1294 1295 1296 1304 1382 584+0 records in 584+0 records out 584 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 83 <<< generates end of last line 1383 <<< ... and whole last line once again 0+1 records in 0+1 records out 8 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 1386 <<< generates last line anyway 0+1 records in 0+1 records out 5 bytes copied https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..9a6f060cbf51 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; -- cgit v1.2.3 From 2d4ecb030dcc90fb725ecbfc82ce5d6c37906e0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:59 +0300 Subject: cgroup: cgroup_procs_next should increase position index If seq_file .next fuction does not change position index, read after some lseek can generate unexpected output: 1) dd bs=1 skip output of each 2nd elements $ dd if=/sys/fs/cgroup/cgroup.procs bs=8 count=1 2 3 4 5 1+0 records in 1+0 records out 8 bytes copied, 0,000267297 s, 29,9 kB/s [test@localhost ~]$ dd if=/sys/fs/cgroup/cgroup.procs bs=1 count=8 2 4 <<< NB! 3 was skipped 6 <<< ... and 5 too 8 <<< ... and 7 8+0 records in 8+0 records out 8 bytes copied, 5,2123e-05 s, 153 kB/s This happen because __cgroup_procs_start() makes an extra extra cgroup_procs_next() call 2) read after lseek beyond end of file generates whole last line. 3) read after lseek into middle of last line generates expected rest of last line and unexpected whole line once again. Additionally patch removes an extra position index changes in __cgroup_procs_start() Cc: stable@vger.kernel.org https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..927f7b82e5c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4595,6 +4595,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4613,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4621,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } -- cgit v1.2.3 From 9c974c77246460fa6a92c18554c3311c8c83c160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 24 Jan 2020 12:40:15 +0100 Subject: cgroup: Iterate tasks that did not finish do_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PF_EXITING is set earlier than actual removal from css_set when a task is exitting. This can confuse cgroup.procs readers who see no PF_EXITING tasks, however, rmdir is checking against css_set membership so it can transitionally fail with EBUSY. Fix this by listing tasks that weren't unlinked from css_set active lists. It may happen that other users of the task iterator (without CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This is equal to the state before commit c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") but it may be reviewed later. Reported-by: Suren Baghdasaryan Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 927f7b82e5c1..c719a4154d6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } -- cgit v1.2.3 From cba6437a1854fde5934098ec3bd0ee83af3129f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 12:19:41 +0100 Subject: genirq/proc: Reject invalid affinity masks (again) Qian Cai reported that the WARN_ON() in the x86/msi affinity setting code, which catches cases where the affinity setting is not done on the CPU which is the current target of the interrupt, triggers during CPU hotplug stress testing. It turns out that the warning which was added with the commit addressing the MSI affinity race unearthed yet another long standing bug. If user space writes a bogus affinity mask, i.e. it contains no online CPUs, then it calls irq_select_affinity_usr(). This was introduced for ALPHA in eee45269b0f5 ("[PATCH] Alpha: convert to generic irq framework (generic part)") and subsequently made available for all architectures in 18404756765c ("genirq: Expose default irq affinity mask (take 3)") which introduced the circumvention of the affinity setting restrictions for interrupt which cannot be moved in process context. The whole exercise is bogus in various aspects: 1) If the interrupt is already started up then there is absolutely no point to honour a bogus interrupt affinity setting from user space. The interrupt is already assigned to an online CPU and it does not make any sense to reassign it to some other randomly chosen online CPU. 2) If the interupt is not yet started up then there is no point either. A subsequent startup of the interrupt will invoke irq_setup_affinity() anyway which will chose a valid target CPU. So the only correct solution is to just return -EINVAL in case user space wrote an affinity mask which does not contain any online CPUs, except for ALPHA which has it's own magic sauce for this. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Qian Cai Link: https://lkml.kernel.org/r/878sl8xdbm.fsf@nanos.tec.linutronix.de --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 18 ++---------------- kernel/irq/proc.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..7eee98c38f25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) -- cgit v1.2.3 From e20d3a055a457a10a4c748ce5b7c2ed3173a1324 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill This if guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. By erroneously doing a bitwise AND instead of a logical AND on user- and kernel-space buffer-size can lead to no data being copied to user-space especially when user-space size is a power of two and bigger then the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { -- cgit v1.2.3 From 140588bfed2727e9a4814211617a79401d74922f Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 14 Feb 2020 18:26:28 +0100 Subject: s390: remove obsolete ieee_emulation_warnings s390 math emulation was removed with commit 5a79859ae0f3 ("s390: remove 31 bit support"), rendering ieee_emulation_warnings useless. The code still built because it was protected by CONFIG_MATHEMU, which was no longer selectable. This patch removes the sysctl_ieee_emulation_warnings declaration and the sysctl entry declaration. Link: https://lkml.kernel.org/r/20200214172628.3598516-1-steve@sk2.org Reviewed-by: Vasily Gorbik Signed-off-by: Stephen Kitt Signed-off-by: Vasily Gorbik --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, -- cgit v1.2.3 From 492e0d0d6f2eb4badfd2868addf9da0f651eba0e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Tue, 18 Feb 2020 09:25:52 -0800 Subject: bpf: Do not grab the bucket spinlock by default on htab batch ops Grabbing the spinlock for every bucket even if it's empty, was causing significant perfomance cost when traversing htab maps that have only a few entries. This patch addresses the issue by checking first the bucket_cnt, if the bucket has some entries then we go and grab the spinlock and proceed with the batching. Tested with a htab of size 50K and different value of populated entries. Before: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 2759655 2752033 BM_DumpHashMap/10 2933722 2930825 BM_DumpHashMap/200 3171680 3170265 BM_DumpHashMap/500 3639607 3635511 BM_DumpHashMap/1000 4369008 4364981 BM_DumpHashMap/5k 11171919 11134028 BM_DumpHashMap/20k 69150080 69033496 BM_DumpHashMap/39k 190501036 190226162 After: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 202707 200109 BM_DumpHashMap/10 213441 210569 BM_DumpHashMap/200 478641 472350 BM_DumpHashMap/500 980061 967102 BM_DumpHashMap/1000 1863835 1839575 BM_DumpHashMap/5k 8961836 8902540 BM_DumpHashMap/20k 69761497 69322756 BM_DumpHashMap/39k 187437830 186551111 Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200218172552.215077-1-brianvv@google.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..9194479a2fa7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1259,7 +1259,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1320,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1348,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1360,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1380,6 +1398,8 @@ again_nocopy: } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ -- cgit v1.2.3 From b9aff38de2cb166476988020428985c5f7412ffc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 19 Feb 2020 15:47:57 -0800 Subject: bpf: Fix a potential deadlock with bpf_map_do_batch Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") added lookup_and_delete batch operation for hash table. The current implementation has bpf_lru_push_free() inside the bucket lock, which may cause a deadlock. syzbot reports: -> #2 (&htab->buckets[i].lock#2){....}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593 __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220 [inline] __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132 __htab_lru_percpu_map_update_elem+0x67e/0xa90 kernel/bpf/hashtab.c:1069 bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585 bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181 generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&loc_l->lock){....}: check_prev_add kernel/locking/lockdep.c:2475 [inline] check_prevs_add kernel/locking/lockdep.c:2580 [inline] validate_chain kernel/locking/lockdep.c:2970 [inline] __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954 lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline] bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555 __htab_map_lookup_and_delete_batch+0x8d4/0x1540 kernel/bpf/hashtab.c:1374 htab_lru_map_lookup_and_delete_batch+0x34/0x40 kernel/bpf/hashtab.c:1491 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Possible unsafe locking scenario: CPU0 CPU2 ---- ---- lock(&htab->buckets[i].lock#2); lock(&l->lock); lock(&htab->buckets[i].lock#2); lock(&loc_l->lock); *** DEADLOCK *** To fix the issue, for htab_lru_map_lookup_and_delete_batch() in CPU0, let us do bpf_lru_push_free() out of the htab bucket lock. This can avoid the above deadlock scenario. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Reported-by: syzbot+a38ff3d9356388f2fb83@syzkaller.appspotmail.com Reported-by: syzbot+122b5421d14e68f29cd1@syzkaller.appspotmail.com Suggested-by: Hillf Danton Suggested-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Brian Vazquez Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200219234757.3544014-1-yhs@fb.com --- kernel/bpf/hashtab.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9194479a2fa7..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,6 +1268,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -1388,10 +1401,18 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; @@ -1399,6 +1420,13 @@ again_nocopy: raw_spin_unlock_irqrestore(&b->lock, flags); locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. -- cgit v1.2.3 From b0c609ab2057d0953fa05e7566f0c0e8a28fa9e1 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:06:21 +0100 Subject: PM / hibernate: fix typo "reserverd_size" -> "reserved_size" Fix a mistake in a variable name in a comment. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * -- cgit v1.2.3 From 279eef0531928a6669230879a6eed081513ad5a3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:38 -0600 Subject: tracing: Make sure synth_event_trace() example always uses u64 synth_event_trace() is the varargs version of synth_event_trace_array(), which takes an array of u64, as do synth_event_add_val() et al. To not only be consistent with those, but also to address the fact that synth_event_trace() expects every arg to be of the same type since it doesn't also pass in e.g. a format string, the caller needs to make sure all args are of the same type, u64. u64 is used because it needs to accomodate the largest type available in synthetic events, which is u64. This fixes the bug reported by the kernel test robot/Rong Chen. Link: https://lore.kernel.org/lkml/20200212113444.GS12867@shao2-debian/ Link: http://lkml.kernel.org/r/894c4e955558b521210ee0642ba194a9e603354c.1581720155.git.zanussi@kernel.org Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module") Reported-by: kernel test robot Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..6866280a9b10 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -350,7 +350,7 @@ static int __init test_add_next_synth_val(void) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)smp_processor_id(),/* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } -- cgit v1.2.3 From 1d9d4c90194a8c3b2f7da9f4bf3f8ba2ed810656 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:39 -0600 Subject: tracing: Make synth_event trace functions endian-correct synth_event_trace(), synth_event_trace_array() and __synth_event_add_val() write directly into the trace buffer and need to take endianness into account, like trace_event_raw_event_synth() does. Link: http://lkml.kernel.org/r/2011354355e405af9c9d28abba430d1f5ff7771a.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 62 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65b54d6a1422..6a380fb83864 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1891,7 +1891,25 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -1943,7 +1961,26 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -2062,8 +2099,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } -- cgit v1.2.3 From 3843083772dc2afde790a6d7160658b00a808da1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:40 -0600 Subject: tracing: Check that number of vals matches number of synth event fields Commit 7276531d4036('tracing: Consolidate trace() functions') inadvertently dropped the synth_event_trace() and synth_event_trace_array() checks that verify the number of values passed in matches the number of fields in the synthetic event being traced, so add them back. Link: http://lkml.kernel.org/r/32819cac708714693669e0dfe10fe9d935e94a16.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6a380fb83864..45622194a34d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1878,6 +1878,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1914,7 +1919,7 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) } } va_end(args); - +out: __synth_event_trace_end(&state); return ret; @@ -1953,6 +1958,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; @@ -1984,7 +1994,7 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, n_u64++; } } - +out: __synth_event_trace_end(&state); return ret; -- cgit v1.2.3 From 784bd0847eda032ed2f3522f87250655a18c0190 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:41 -0600 Subject: tracing: Fix number printing bug in print_synth_event() Fix a varargs-related bug in print_synth_event() which resulted in strange output and oopses on 32-bit x86 systems. The problem is that trace_seq_printf() expects the varargs to match the format string, but print_synth_event() was always passing u64 values regardless. This results in unspecified behavior when unpacking with va_arg() in trace_seq_printf(). Add a function that takes the size into account when calling trace_seq_printf(). Before: modprobe-1731 [003] .... 919.039758: gen_synth_test: next_pid_field=777(null)next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=3(null)my_string_field=thneed my_int_field=598(null) After: insmod-1136 [001] .... 36.634590: gen_synth_test: next_pid_field=777 next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=1 my_string_field=thneed my_int_field=598 Link: http://lkml.kernel.org/r/a9b59eb515dbbd7d4abe53b347dccf7a8e285657.1581720155.git.zanussi@kernel.org Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45622194a34d..f068d55bd37f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -820,6 +820,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -858,10 +881,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); -- cgit v1.2.3 From 3c18a9be7c9d4f53239795282c5d927f73f534b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 16:29:50 -0500 Subject: tracing: Have synthetic event test use raw_smp_processor_id() The test code that tests synthetic event creation pushes in as one of its test fields the current CPU using "smp_processor_id()". As this is just something to see if the value is correctly passed in, and the actual CPU used does not matter, use raw_smp_processor_id(), otherwise with debug preemption enabled, a warning happens as the smp_processor_id() is called without preemption enabled. Link: http://lkml.kernel.org/r/20200220162950.35162579@gandalf.local.home Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 6866280a9b10..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -114,7 +114,7 @@ static int __init test_gen_synth_cmd(void) vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ @@ -221,7 +221,7 @@ static int __init test_empty_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ @@ -293,7 +293,7 @@ static int __init test_create_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ @@ -345,7 +345,7 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -427,7 +427,7 @@ static int __init test_trace_synth_event(void) (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ (u64)1000, /* ts_ms */ - (u64)smp_processor_id(),/* cpu */ + (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ (u64)999); /* my_int_field */ return ret; -- cgit v1.2.3 From 78041c0c9e935d9ce4086feeff6c569ed88ddfd4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 15:38:01 -0500 Subject: tracing: Disable trace_printk() on post poned tests The tracing seftests checks various aspects of the tracing infrastructure, and one is filtering. If trace_printk() is active during a self test, it can cause the filtering to fail, which will disable that part of the trace. To keep the selftests from failing because of trace_printk() calls, trace_printk() checks the variable tracing_selftest_running, and if set, it does not write to the tracing buffer. As some tracers were registered earlier in boot, the selftest they triggered would fail because not all the infrastructure was set up for the full selftest. Thus, some of the tests were post poned to when their infrastructure was ready (namely file system code). The postpone code did not set the tracing_seftest_running variable, and could fail if a trace_printk() was added and executed during their run. Cc: stable@vger.kernel.org Fixes: 9afecfbb95198 ("tracing: Postpone tracer start-up tests till the system is more robust") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183b031a3828..a89c562ffb8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 7ab215f22d04067094de8c81c20ba4c565ff8dd4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 17 Feb 2020 18:52:39 +0900 Subject: tracing: Clear trace_state when starting trace Clear trace_state data structure when starting trace in __synth_event_trace_start() internal function. Currently trace_state is initialized only in the synth_event_trace_start() API, but the trace_state in synth_event_trace() and synth_event_trace_array() are on the stack without initialization. This means those APIs will see wrong parameters and wil skip closing process in __synth_event_trace_end() because trace_state->disabled may be !0. Link: http://lkml.kernel.org/r/158193315899.8868.1781259176894639952.stgit@devnote2 Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f068d55bd37f..9d87aa1f0b79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1824,6 +1824,8 @@ __synth_event_trace_start(struct trace_event_file *file, int entry_size, fields_size = 0; int ret = 0; + memset(trace_state, '\0', sizeof(*trace_state)); + /* * Normal event tracing doesn't get called at all unless the * ENABLED bit is set (which attaches the probe thus allowing @@ -2063,8 +2065,6 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - memset(trace_state, '\0', sizeof(*trace_state)); - ret = __synth_event_trace_start(file, trace_state); if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ -- cgit v1.2.3 From d8a953ddde5ec30a36810d0a892c3949b50849e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Feb 2020 21:18:33 +0900 Subject: bootconfig: Set CONFIG_BOOT_CONFIG=n by default Set CONFIG_BOOT_CONFIG=n by default. This also warns user if CONFIG_BOOT_CONFIG=n but "bootconfig" is given in the kernel command line. Link: http://lkml.kernel.org/r/158220111291.26565.9036889083940367969.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..795c3e02d3f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,7 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING + depends on TRACING + select BOOT_CONFIG default y help Enable developer to setup ftrace subsystem via supplemental -- cgit v1.2.3 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 64 ------------------------------------------------------ kernel/time/time.c | 43 ------------------------------------ 2 files changed, 107 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.3 From 2ad3e17ebf94b7b7f3f64c050ff168f9915345eb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 22 Feb 2020 20:36:47 -0500 Subject: audit: fix error handling in audit_data_to_entry() Commit 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") combined a number of separate fields in the audit_field struct into a single union. Generally this worked just fine because they are generally mutually exclusive. Unfortunately in audit_data_to_entry() the overlap can be a problem when a specific error case is triggered that causes the error path code to attempt to cleanup an audit_field struct and the cleanup involves attempting to free a stored LSM string (the lsm_str field). Currently the code always has a non-NULL value in the audit_field.lsm_str field as the top of the for-loop transfers a value into audit_field.val (both .lsm_str and .val are part of the same union); if audit_data_to_entry() fails and the audit_field struct is specified to contain a LSM string, but the audit_field.lsm_str has not yet been properly set, the error handling code will attempt to free the bogus audit_field.lsm_str value that was set with audit_field.val at the top of the for-loop. This patch corrects this by ensuring that the audit_field.val is only set when needed (it is cleared when the audit_field struct is allocated with kcalloc()). It also corrects a few other issues to ensure that in case of error the proper error code is returned. Cc: stable@vger.kernel.org Fixes: 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") Reported-by: syzbot+1f4d90ead370d72e450b@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/auditfilter.c | 71 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } -- cgit v1.2.3 From 756125289285f6e55a03861bf4b6257aa3d19a93 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 24 Feb 2020 16:38:57 -0500 Subject: audit: always check the netlink payload length in audit_receive_msg() This patch ensures that we always check the netlink payload length in audit_receive_msg() before we take any action on the payload itself. Cc: stable@vger.kernel.org Reported-by: syzbot+399c44bf1f43b8747403@syzkaller.appspotmail.com Reported-by: syzbot+e4b12d8d202701f08b6d@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/audit.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) -- cgit v1.2.3 From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Feb 2020 15:28:12 +0100 Subject: blktrace: Protect q->blk_trace with RCU KASAN is reporting that __blk_add_trace() has a use-after-free issue when accessing q->blk_trace. Indeed the switching of block tracing (and thus eventual freeing of q->blk_trace) is completely unsynchronized with the currently running tracing and thus it can happen that the blk_trace structure is being freed just while __blk_add_trace() works on it. Protect accesses to q->blk_trace by RCU during tracing and make sure we wait for the end of RCU grace period when shutting down tracing. Luckily that is rare enough event that we can afford that. Note that postponing the freeing of blk_trace to an RCU callback should better be avoided as it could have unexpected user visible side-effects as debugfs files would be still existing for a short while block tracing has been shut down. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 CC: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Bart Van Assche Reported-by: Tristan Madani Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 114 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: -- cgit v1.2.3 From 2910b5aa6f545c044173a5cab3dbb7f43e23916d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 25 Feb 2020 23:36:41 +0900 Subject: bootconfig: Fix CONFIG_BOOTTIME_TRACING dependency issue Since commit d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") also changed the CONFIG_BOOTTIME_TRACING to select CONFIG_BOOT_CONFIG to show the boot-time tracing on the menu, it introduced wrong dependencies with BLK_DEV_INITRD as below. WARNING: unmet direct dependencies detected for BOOT_CONFIG Depends on [n]: BLK_DEV_INITRD [=n] Selected by [y]: - BOOTTIME_TRACING [=y] && TRACING_SUPPORT [=y] && FTRACE [=y] && TRACING [=y] This makes the CONFIG_BOOT_CONFIG selects CONFIG_BLK_DEV_INITRD to fix this error and make CONFIG_BOOTTIME_TRACING=n by default, so that both boot-time tracing and boot configuration off but those appear on the menu list. Link: http://lkml.kernel.org/r/158264140162.23842.11237423518607465535.stgit@devnote2 Fixes: d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") Reported-by: Randy Dunlap Compiled-tested-by: Randy Dunlap Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 795c3e02d3f1..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -145,7 +145,6 @@ config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING select BOOT_CONFIG - default y help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver -- cgit v1.2.3 From fda31c50292a5062332fa0343c084bd9f46604d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Feb 2020 12:47:14 -0800 Subject: signal: avoid double atomic counter increments for user accounting When queueing a signal, we increment both the users count of pending signals (for RLIMIT_SIGPENDING tracking) and we increment the refcount of the user struct itself (because we keep a reference to the user in the signal structure in order to correctly account for it when freeing). That turns out to be fairly expensive, because both of them are atomic updates, and particularly under extreme signal handling pressure on big machines, you can get a lot of cache contention on the user struct. That can then cause horrid cacheline ping-pong when you do these multiple accesses. So change the reference counting to only pin the user for the _first_ pending signal, and to unpin it when the last pending signal is dequeued. That means that when a user sees a lot of concurrent signal queuing - which is the only situation when this matters - the only atomic access needed is generally the 'sigpending' count update. This was noticed because of a particularly odd timing artifact on a dual-socket 96C/192T Cascade Lake platform: when you get into bad contention, on that machine for some reason seems to be much worse when the contention happens in the upper 32-byte half of the cacheline. As a result, the kernel test robot will-it-scale 'signal1' benchmark had an odd performance regression simply due to random alignment of the 'struct user_struct' (and pointed to a completely unrelated and apparently nonsensical commit for the regression). Avoiding the double increments (and decrements on the dequeueing side, of course) makes for much less contention and hugely improved performance on that will-it-scale microbenchmark. Quoting Feng Tang: "It makes a big difference, that the performance score is tripled! bump from original 17000 to 54000. Also the gap between 5.0-rc6 and 5.0-rc6+Jiri's patch is reduced to around 2%" [ The "2% gap" is the odd cacheline placement difference on that platform: under the extreme contention case, the effect of which half of the cacheline was hot was 5%, so with the reduced contention the odd timing artifact is reduced too ] It does help in the non-contended case too, but is not nearly as noticeable. Reported-and-tested-by: Feng Tang Cc: Eric W. Biederman Cc: Huang, Ying Cc: Philip Li Cc: Andi Kleen Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/signal.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } -- cgit v1.2.3 From 289de35984815576793f579ec27248609e75976e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 18 Feb 2020 15:45:34 +0100 Subject: sched/fair: Fix statistics for find_idlest_group() sgs->group_weight is not set while gathering statistics in update_sg_wakeup_stats(). This means that a group can be classified as fully busy with 0 running tasks if utilization is high enough. This path is mainly used for fork and exec. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20200218144534.4564-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..c1217bfe5e81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8337,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* -- cgit v1.2.3 From 0c282b068eb26db0e85e2ab4ec6d1e932acda841 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 27 Jan 2020 23:28:21 +0530 Subject: fork: Use RCU_INIT_POINTER() instead of rcu_access_pointer() Use RCU_INIT_POINTER() instead of rcu_access_pointer() in copy_sighand(). Suggested-by: Oleg Nesterov Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200127175821.10833-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..86425305cd4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; -- cgit v1.2.3 From 22a34c6fe0ffc1d92ee26a25913fadf347258fd6 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 30 Jan 2020 11:50:28 +0530 Subject: exit: Fix Sparse errors and warnings This patch fixes the following sparse error: kernel/exit.c:627:25: error: incompatible types in comparison expression And the following warning: kernel/exit.c:626:40: warning: incorrect type in assignment Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200130062028.4870-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..0b81b26a872a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -619,8 +619,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) -- cgit v1.2.3 From ecc421e05bab97cf3ff4fe456ade47ef84dba8c2 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Tue, 3 Mar 2020 16:06:38 +0100 Subject: sys/sysinfo: Respect boottime inside time namespace The sysinfo() syscall includes uptime in seconds but has no correction for time namespaces which makes it inconsistent with the /proc/uptime inside of a time namespace. Add the missing time namespace adjustment call. Signed-off-by: Cyril Hrubis Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Link: https://lkml.kernel.org/r/20200303150638.7329-1-chrubis@suse.cz --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -- cgit v1.2.3 From 190ecb190a9cd8c0599d8499b901e3c32e87966a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sun, 23 Feb 2020 22:00:07 -0500 Subject: cgroup: fix psi_show() crash on 32bit ino archs Similar to the commit d7495343228f ("cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root()"), cgroup_id(root_cgrp) does not equal to 1 on 32bit ino archs which triggers all sorts of issues with psi_show() on s390x. For example, BUG: KASAN: slab-out-of-bounds in collect_percpu_times+0x2d0/ Read of size 4 at addr 000000001e0ce000 by task read_all/3667 collect_percpu_times+0x2d0/0x798 psi_show+0x7c/0x2a8 seq_read+0x2ac/0x830 vfs_read+0x92/0x150 ksys_read+0xe2/0x188 system_call+0xd8/0x2b4 Fix it by using cgroup_ino(). Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Qian Cai Acked-by: Johannes Weiner Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v5.5 --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c719a4154d6d..7a39dc882095 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } -- cgit v1.2.3 From 2e5383d7904e60529136727e49629a82058a5607 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 19 Feb 2020 12:01:29 -0700 Subject: cgroup1: don't call release_agent when it is "" Older (and maybe current) versions of systemd set release_agent to "" when shutting down, but do not set notify_on_release to 0. Since 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()"), we filter out such calls when the user mode helper path is "". However, when used in conjunction with an actual (i.e. non "") STATIC_USERMODEHELPER, the path is never "", so the real usermode helper will be called with argv[0] == "". Let's avoid this by not invoking the release_agent when it is "". Signed-off-by: Tycho Andersen Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 9a6f060cbf51..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -783,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock); -- cgit v1.2.3 From 153031a301bb07194e9c37466cfce8eacb977621 Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Wed, 4 Mar 2020 13:58:19 +0300 Subject: blktrace: fix dereference after null check There was a recent change in blktrace.c that added a RCU protection to `q->blk_trace` in order to fix a use-after-free issue during access. However the change missed an edge case that can lead to dereferencing of `bt` pointer even when it's NULL: Coverity static analyzer marked this as a FORWARD_NULL issue with CID 1460458. ``` /kernel/trace/blktrace.c: 1904 in sysfs_blk_trace_attr_store() 1898 ret = 0; 1899 if (bt == NULL) 1900 ret = blk_trace_setup_queue(q, bdev); 1901 1902 if (ret == 0) { 1903 if (attr == &dev_attr_act_mask) >>> CID 1460458: Null pointer dereferences (FORWARD_NULL) >>> Dereferencing null pointer "bt". 1904 bt->act_mask = value; 1905 else if (attr == &dev_attr_pid) 1906 bt->pid = value; 1907 else if (attr == &dev_attr_start_lba) 1908 bt->start_lba = value; 1909 else if (attr == &dev_attr_end_lba) ``` Added a reassignment with RCU annotation to fix the issue. Fixes: c780e86dd48 ("blktrace: Protect q->blk_trace with RCU") Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Reviewed-by: Bob Liu Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Cengiz Can Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4560878f0bac..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (bt == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) -- cgit v1.2.3 From 1bc7896e9ef44fd77858b3ef0b8a6840be3a4494 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 4 Mar 2020 11:11:04 -0800 Subject: bpf: Fix deadlock with rq_lock in bpf_send_signal() When experimenting with bpf_send_signal() helper in our production environment (5.2 based), we experienced a deadlock in NMI mode: #5 [ffffc9002219f770] queued_spin_lock_slowpath at ffffffff8110be24 #6 [ffffc9002219f770] _raw_spin_lock_irqsave at ffffffff81a43012 #7 [ffffc9002219f780] try_to_wake_up at ffffffff810e7ecd #8 [ffffc9002219f7e0] signal_wake_up_state at ffffffff810c7b55 #9 [ffffc9002219f7f0] __send_signal at ffffffff810c8602 #10 [ffffc9002219f830] do_send_sig_info at ffffffff810ca31a #11 [ffffc9002219f868] bpf_send_signal at ffffffff8119d227 #12 [ffffc9002219f988] bpf_overflow_handler at ffffffff811d4140 #13 [ffffc9002219f9e0] __perf_event_overflow at ffffffff811d68cf #14 [ffffc9002219fa10] perf_swevent_overflow at ffffffff811d6a09 #15 [ffffc9002219fa38] ___perf_sw_event at ffffffff811e0f47 #16 [ffffc9002219fc30] __schedule at ffffffff81a3e04d #17 [ffffc9002219fc90] schedule at ffffffff81a3e219 #18 [ffffc9002219fca0] futex_wait_queue_me at ffffffff8113d1b9 #19 [ffffc9002219fcd8] futex_wait at ffffffff8113e529 #20 [ffffc9002219fdf0] do_futex at ffffffff8113ffbc #21 [ffffc9002219fec0] __x64_sys_futex at ffffffff81140d1c #22 [ffffc9002219ff38] do_syscall_64 at ffffffff81002602 #23 [ffffc9002219ff50] entry_SYSCALL_64_after_hwframe at ffffffff81c00068 The above call stack is actually very similar to an issue reported by Commit eac9153f2b58 ("bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack()") by Song Liu. The only difference is bpf_send_signal() helper instead of bpf_get_stack() helper. The above deadlock is triggered with a perf_sw_event. Similar to Commit eac9153f2b58, the below almost identical reproducer used tracepoint point sched/sched_switch so the issue can be easily caught. /* stress_test.c */ #include #include #include #include #include #include #include #define THREAD_COUNT 1000 char *filename; void *worker(void *p) { void *ptr; int fd; char *pptr; fd = open(filename, O_RDONLY); if (fd < 0) return NULL; while (1) { struct timespec ts = {0, 1000 + rand() % 2000}; ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0); usleep(1); if (ptr == MAP_FAILED) { printf("failed to mmap\n"); break; } munmap(ptr, 4096 * 64); usleep(1); pptr = malloc(1); usleep(1); pptr[0] = 1; usleep(1); free(pptr); usleep(1); nanosleep(&ts, NULL); } close(fd); return NULL; } int main(int argc, char *argv[]) { void *ptr; int i; pthread_t threads[THREAD_COUNT]; if (argc < 2) return 0; filename = argv[1]; for (i = 0; i < THREAD_COUNT; i++) { if (pthread_create(threads + i, NULL, worker, NULL)) { fprintf(stderr, "Error creating thread\n"); return 0; } } for (i = 0; i < THREAD_COUNT; i++) pthread_join(threads[i], NULL); return 0; } and the following command: 1. run `stress_test /bin/ls` in one windown 2. hack bcc trace.py with the following change: --- a/tools/trace.py +++ b/tools/trace.py @@ -513,6 +513,7 @@ BPF_PERF_OUTPUT(%s); __data.tgid = __tgid; __data.pid = __pid; bpf_get_current_comm(&__data.comm, sizeof(__data.comm)); + bpf_send_signal(10); %s %s %s.perf_submit(%s, &__data, sizeof(__data)); 3. in a different window run ./trace.py -p $(pidof stress_test) t:sched:sched_switch The deadlock can be reproduced in our production system. Similar to Song's fix, the fix is to delay sending signal if irqs is disabled to avoid deadlocks involving with rq_lock. With this change, my above stress-test in our production system won't cause deadlock any more. I also implemented a scale-down version of reproducer in the selftest (a subsequent commit). With latest bpf-next, it complains for the following potential deadlock. [ 32.832450] -> #1 (&p->pi_lock){-.-.}: [ 32.833100] _raw_spin_lock_irqsave+0x44/0x80 [ 32.833696] task_rq_lock+0x2c/0xa0 [ 32.834182] task_sched_runtime+0x59/0xd0 [ 32.834721] thread_group_cputime+0x250/0x270 [ 32.835304] thread_group_cputime_adjusted+0x2e/0x70 [ 32.835959] do_task_stat+0x8a7/0xb80 [ 32.836461] proc_single_show+0x51/0xb0 ... [ 32.839512] -> #0 (&(&sighand->siglock)->rlock){....}: [ 32.840275] __lock_acquire+0x1358/0x1a20 [ 32.840826] lock_acquire+0xc7/0x1d0 [ 32.841309] _raw_spin_lock_irqsave+0x44/0x80 [ 32.841916] __lock_task_sighand+0x79/0x160 [ 32.842465] do_send_sig_info+0x35/0x90 [ 32.842977] bpf_send_signal+0xa/0x10 [ 32.843464] bpf_prog_bc13ed9e4d3163e3_send_signal_tp_sched+0x465/0x1000 [ 32.844301] trace_call_bpf+0x115/0x270 [ 32.844809] perf_trace_run_bpf_submit+0x4a/0xc0 [ 32.845411] perf_trace_sched_switch+0x10f/0x180 [ 32.846014] __schedule+0x45d/0x880 [ 32.846483] schedule+0x5f/0xd0 ... [ 32.853148] Chain exists of: [ 32.853148] &(&sighand->siglock)->rlock --> &p->pi_lock --> &rq->lock [ 32.853148] [ 32.854451] Possible unsafe locking scenario: [ 32.854451] [ 32.855173] CPU0 CPU1 [ 32.855745] ---- ---- [ 32.856278] lock(&rq->lock); [ 32.856671] lock(&p->pi_lock); [ 32.857332] lock(&rq->lock); [ 32.857999] lock(&(&sighand->siglock)->rlock); Deadlock happens on CPU0 when it tries to acquire &sighand->siglock but it has been held by CPU1 and CPU1 tries to grab &rq->lock and cannot get it. This is not exactly the callstack in our production environment, but sympotom is similar and both locks are using spin_lock_irqsave() to acquire the lock, and both involves rq_lock. The fix to delay sending signal when irq is disabled also fixed this issue. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Song Liu Link: https://lore.kernel.org/bpf/20200304191104.2796501-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..68250d433bd7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ -- cgit v1.2.3 From 8e5290e710f4ffe8e9f8813e2ed0397a94d7b2f1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 4 Mar 2020 17:34:47 -0800 Subject: bpf: Return better error value in delete_elem for struct_ops map The current always succeed behavior in bpf_struct_ops_map_delete_elem() is not ideal for userspace tool. It can be improved to return proper error value. If it is in TOBEFREE, it means unregistration has been already done before but it is in progress and waiting for the subsystem to clear the refcnt to zero, so -EINPROGRESS. If it is INIT, it means the struct_ops has not been registered yet, so -ENOENT. Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200305013447.535326-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..68a89a9f7ccd 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, -- cgit v1.2.3 From 849b4d94582a966ecb533448415462846da1f0fa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 4 Mar 2020 17:34:54 -0800 Subject: bpf: Do not allow map_freeze in struct_ops map struct_ops map cannot support map_freeze. Otherwise, a struct_ops cannot be unregistered from the subsystem. Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200305013454.535397-1-kafai@fb.com --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..0c7fb0d4836d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1510,6 +1510,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { -- cgit v1.2.3 From 8019ad13ef7f64be44d4f892af9c840179009254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2020 11:28:31 +0100 Subject: futex: Fix inode life-time issue As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) --- kernel/futex.c | 89 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..e14f7cd45dbd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -659,8 +704,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,40 +735,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err; -- cgit v1.2.3 From b26ebfe12f34f372cf041c6f801fa49c3fb382c5 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 6 Mar 2020 11:23:14 -0600 Subject: pid: Fix error return value in some cases Recent changes to alloc_pid() allow the pid number to be specified on the command line. If set_tid_size is set, then the code scanning the levels will hard-set retval to -EPERM, overriding it's previous -ENOMEM value. After the code scanning the levels, there are error returns that do not set retval, assuming it is still set to -ENOMEM. So set retval back to -ENOMEM after scanning the levels. Fixes: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Corey Minyard Acked-by: Christian Brauner Cc: Andrei Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Oleg Nesterov Cc: Adrian Reber Cc: # 5.5 Link: https://lore.kernel.org/r/20200306172314.12232-1-minyard@acm.org [christian.brauner@ubuntu.com: fixup commit message] Signed-off-by: Christian Brauner --- kernel/pid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..19645b25b77c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; -- cgit v1.2.3 From 8d67743653dce5a0e7aa500fcccb237cde7ad88e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 8 Mar 2020 19:07:17 +0100 Subject: futex: Unbreak futex hashing The recent futex inode life time fix changed the ordering of the futex key union struct members, but forgot to adjust the hash function accordingly, As a result the hashing omits the leading 64bit and even hashes beyond the futex key causing a bad hash distribution which led to a ~100% performance regression. Hand in the futex key pointer instead of a random struct member and make the size calculation based of the struct offset. Fixes: 8019ad13ef7f ("futex: Fix inode life-time issue") Reported-by: Rong Chen Decoded-by: Linus Torvalds Signed-off-by: Thomas Gleixner Tested-by: Rong Chen Link: https://lkml.kernel.org/r/87h7yy90ve.fsf@nanos.tec.linutronix.de --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e14f7cd45dbd..82dfacb3250e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -385,9 +385,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } -- cgit v1.2.3 From 10dab84caf400f2f5f8b010ebb0c7c4272ec5093 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 8 Mar 2020 14:29:17 +0100 Subject: pid: make ENOMEM return value more obvious The alloc_pid() codepath used to be simpler. With the introducation of the ability to choose specific pids in 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") it got more complex. It hasn't been super obvious that ENOMEM is returned when the pid namespace init process/child subreaper of the pid namespace has died. As can be seen from multiple attempts to improve this see e.g. [1] and most recently [2]. We regressed returning ENOMEM in [3] and [2] restored it. Let's add a comment on top explaining that this is historic and documented behavior and cannot easily be changed. [1]: 35f71bc0a09a ("fork: report pid reservation failure properly") [2]: b26ebfe12f34 ("pid: Fix error return value in some cases") [3]: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Christian Brauner --- kernel/pid.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 19645b25b77c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,14 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ retval = -ENOMEM; if (unlikely(is_child_reaper(pid))) { -- cgit v1.2.3 From 62039c30c19dcab96621e074aeeb90da7100def7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:27:55 -0700 Subject: bpf: Initialize storage pointers to NULL to prevent freeing garbage pointer Local storage array isn't initialized, so if cgroup storage allocation fails for BPF_CGROUP_STORAGE_SHARED, error handling code will attempt to free uninitialized pointer for BPF_CGROUP_STORAGE_PERCPU storage type. Avoid this by always initializing storage pointers to NULLs. Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200309222756.1018737-1-andriin@fb.com --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9a500fadbef5..b2bc4c335bb1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -302,8 +302,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; int err; -- cgit v1.2.3 From 1d8006abaab4cb90f81add86e8d1bf9411add05a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:40:17 -0700 Subject: bpf: Fix cgroup ref leak in cgroup_bpf_inherit on out-of-memory There is no compensating cgroup_bpf_put() for each ancestor cgroup in cgroup_bpf_inherit(). If compute_effective_progs returns error, those cgroups won't be freed ever. Fix it by putting them in cleanup code path. Fixes: e10360f815ca ("bpf: cgroup: prevent out-of-order release of cgroup bpf") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20200309224017.1063297-1-andriin@fb.com --- kernel/bpf/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2bc4c335bb1..4f1472409ef8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -227,6 +227,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; -- cgit v1.2.3 From aa202f1f56960c60e7befaa0f49c72b8fa11b0a8 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Fri, 24 Jan 2020 20:14:45 -0500 Subject: workqueue: don't use wq_select_unbound_cpu() for bound works wq_select_unbound_cpu() is designed for unbound workqueues only, but it's wrongly called when using a bound workqueue too. Fixing this ensures work queued to a bound workqueue with cpu=WORK_CPU_UNBOUND always runs on the local CPU. Before, that would happen only if wq_unbound_cpumask happened to include it (likely almost always the case), or was empty, or we got lucky with forced round-robin placement. So restricting /sys/devices/virtual/workqueue/cpumask to a small subset of a machine's CPUs would cause some bound work items to run unexpectedly there. Fixes: ef557180447f ("workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs") Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Hillf Danton [dj: massage changelog] Signed-off-by: Daniel Jordan Cc: Tejun Heo Cc: Lai Jiangshan Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 301db4406bc3..4e01c448b4b4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; rcu_read_lock(); retry: - if (req_cpu == WORK_CPU_UNBOUND) - cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else + if (wq->flags & WQ_UNBOUND) { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + } else { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + } /* * If @work was previously on a different pool, it might still be -- cgit v1.2.3 From da6c7faeb103c493e505e87643272f70be586635 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Tue, 10 Mar 2020 16:32:29 +0900 Subject: bpf/btf: Fix BTF verification of enum members in struct/union btf_enum_check_member() was currently sure to recognize the size of "enum" type members in struct/union as the size of "int" even if its size was packed. This patch fixes BTF enum verification to use the correct size of member in BPF programs. Fixes: 179cde8cef7e ("bpf: btf: Check members of struct/union") Signed-off-by: Yoshiki Komachi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1583825550-18606-2-git-send-email-komachi.yoshiki@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..32ab9225026e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; -- cgit v1.2.3 From e876ecc67db80dfdb8e237f71e5b43bb88ae549c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Mar 2020 22:16:05 -0700 Subject: cgroup: memcg: net: do not associate sock with unrelated cgroup We are testing network memory accounting in our setup and noticed inconsistent network memory usage and often unrelated cgroups network usage correlates with testing workload. On further inspection, it seems like mem_cgroup_sk_alloc() and cgroup_sk_alloc() are broken in irq context specially for cgroup v1. mem_cgroup_sk_alloc() and cgroup_sk_alloc() can be called in irq context and kind of assumes that this can only happen from sk_clone_lock() and the source sock object has already associated cgroup. However in cgroup v1, where network memory accounting is opt-in, the source sock can be unassociated with any cgroup and the new cloned sock can get associated with unrelated interrupted cgroup. Cgroup v2 can also suffer if the source sock object was created by process in the root cgroup or if sk_alloc() is called in irq context. The fix is to just do nothing in interrupt. WARNING: Please note that about half of the TCP sockets are allocated from the IRQ context, so, memory used by such sockets will not be accouted by the memcg. The stack trace of mem_cgroup_sk_alloc() from IRQ-context: CPU: 70 PID: 12720 Comm: ssh Tainted: 5.6.0-smp-DEV #1 Hardware name: ... Call Trace: dump_stack+0x57/0x75 mem_cgroup_sk_alloc+0xe9/0xf0 sk_clone_lock+0x2a7/0x420 inet_csk_clone_lock+0x1b/0x110 tcp_create_openreq_child+0x23/0x3b0 tcp_v6_syn_recv_sock+0x88/0x730 tcp_check_req+0x429/0x560 tcp_v6_rcv+0x72d/0xa40 ip6_protocol_deliver_rcu+0xc9/0x400 ip6_input+0x44/0xd0 ? ip6_protocol_deliver_rcu+0x400/0x400 ip6_rcv_finish+0x71/0x80 ipv6_rcv+0x5b/0xe0 ? ip6_sublist_rcv+0x2e0/0x2e0 process_backlog+0x108/0x1e0 net_rx_action+0x26b/0x460 __do_softirq+0x104/0x2a6 do_softirq_own_stack+0x2a/0x40 do_softirq.part.19+0x40/0x50 __local_bh_enable_ip+0x51/0x60 ip6_finish_output2+0x23d/0x520 ? ip6table_mangle_hook+0x55/0x160 __ip6_finish_output+0xa1/0x100 ip6_finish_output+0x30/0xd0 ip6_output+0x73/0x120 ? __ip6_finish_output+0x100/0x100 ip6_xmit+0x2e3/0x600 ? ipv6_anycast_cleanup+0x50/0x50 ? inet6_csk_route_socket+0x136/0x1e0 ? skb_free_head+0x1e/0x30 inet6_csk_xmit+0x95/0xf0 __tcp_transmit_skb+0x5b4/0xb20 __tcp_send_ack.part.60+0xa3/0x110 tcp_send_ack+0x1d/0x20 tcp_rcv_state_process+0xe64/0xe80 ? tcp_v6_connect+0x5d1/0x5f0 tcp_v6_do_rcv+0x1b1/0x3f0 ? tcp_v6_do_rcv+0x1b1/0x3f0 __release_sock+0x7f/0xd0 release_sock+0x30/0xa0 __inet_stream_connect+0x1c3/0x3b0 ? prepare_to_wait+0xb0/0xb0 inet_stream_connect+0x3b/0x60 __sys_connect+0x101/0x120 ? __sys_getsockopt+0x11b/0x140 __x64_sys_connect+0x1a/0x20 do_syscall_64+0x51/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The stack trace of mem_cgroup_sk_alloc() from IRQ-context: Fixes: 2d7580738345 ("mm: memcontrol: consolidate cgroup socket tracking") Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Signed-off-by: David S. Miller --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..6b2fc56b2201 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6258,6 +6258,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; } + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock(); while (true) { -- cgit v1.2.3 From d9815bff6b379ff46981bea9dfeb146081eab314 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Fri, 6 Mar 2020 18:43:17 +0100 Subject: ftrace: Return the first found result in lookup_rec() It appears that ip ranges can overlap so. In that case lookup_rec() returns whatever results it got last even if it found nothing in last searched page. This breaks an obscure livepatch late module patching usecase: - load livepatch - load the patched module - unload livepatch - try to load livepatch again To fix this return from lookup_rec() as soon as it found the record containing searched-for ip. This used to be this way prior lookup_rec() introduction. Link: http://lkml.kernel.org/r/20200306174317.21699-1-asavkov@redhat.com Cc: stable@vger.kernel.org Fixes: 7e16f581a817 ("ftrace: Separate out functionality from ftrace_location_range()") Signed-off-by: Artem Savkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f7ee102868a..fd81c7de77a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); + if (rec) + break; } return rec; } -- cgit v1.2.3 From 8e7ae2518f5265f0ef09d561748098fde5a87ccd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 13 Mar 2020 18:02:09 -0700 Subject: bpf: Sanitize the bpf_struct_ops tcp-cc name The bpf_struct_ops tcp-cc name should be sanitized in order to avoid problematic chars (e.g. whitespaces). This patch reuses the bpf_obj_name_cpy() for accepting the same set of characters in order to keep a consistent bpf programming experience. A "size" param is added. Also, the strlen is returned on success so that the caller (like the bpf_tcp_ca here) can error out on empty name. The existing callers of the bpf_obj_name_cpy() only need to change the testing statement to "if (err < 0)". For all these existing callers, the err will be overwritten later, so no extra change is needed for the new strlen return value. v3: - reverse xmas tree style v2: - Save the orig_src to avoid "end - size" (Andrii) Fixes: 0baf26b0fcd7 ("bpf: tcp: Support tcp_congestion_ops in bpf") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200314010209.1131542-1-kafai@fb.com --- kernel/bpf/syscall.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c7fb0d4836d..2857b7dda382 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -2098,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ -- cgit v1.2.3 From 8096f229421f7b22433775e928d506f0342e5907 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 20 Mar 2020 10:48:13 +0100 Subject: bpf: Explicitly memset the bpf_attr structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the bpf syscall, we are relying on the compiler to properly zero out the bpf_attr union that we copy userspace data into. Unfortunately that doesn't always work properly, padding and other oddities might not be correctly zeroed, and in some tests odd things have been found when the stack is pre-initialized to other values. Fix this by explicitly memsetting the structure to 0 before using it. Reported-by: Maciej Żenczykowski Reported-by: John Stultz Reported-by: Alexander Potapenko Reported-by: Alistair Delva Signed-off-by: Greg Kroah-Hartman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://android-review.googlesource.com/c/kernel/common/+/1235490 Link: https://lore.kernel.org/bpf/20200320094813.GA421650@kroah.com --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2857b7dda382..3ac1629cf00e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3362,7 +3362,7 @@ err_put: SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3374,6 +3374,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; -- cgit v1.2.3 From 5c6f25887963f15492b604dd25cb149c501bbabf Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 20 Mar 2020 17:22:58 +0100 Subject: bpf: Explicitly memset some bpf info structures declared on the stack Trying to initialize a structure with "= {};" will not always clean out all padding locations in a structure. So be explicit and call memset to initialize everything for a number of bpf information structures that are then copied from userspace, sometimes from smaller memory locations than the size of the structure. Reported-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200320162258.GA794295@kroah.com --- kernel/bpf/btf.c | 3 ++- kernel/bpf/syscall.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 32ab9225026e..7787bdcb5d68 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4564,7 +4564,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4573,6 +4573,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3ac1629cf00e..966b7b34cde0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2795,7 +2795,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2807,6 +2807,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3070,7 +3071,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3079,6 +3080,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; -- cgit v1.2.3 From df81dfcfd6991d547653d46c051bac195cd182c1 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 13 Mar 2020 20:33:07 +0000 Subject: genirq: Fix reference leaks on irq affinity notifiers The handling of notify->work did not properly maintain notify->kref in two cases: 1) where the work was already scheduled, another irq_set_affinity_locked() would get the ref and (no-op-ly) schedule the work. Thus when irq_affinity_notify() ran, it would drop the original ref but not the additional one. 2) when cancelling the (old) work in irq_set_affinity_notifier(), if there was outstanding work a ref had been got for it but was never put. Fix both by checking the return values of the work handling functions (schedule_work() for (1) and cancel_work_sync() for (2)) and put the extra ref if the return value indicates preexisting work. Fixes: cd7eab44e994 ("genirq: Add IRQ affinity notifiers") Fixes: 59c39840f5ab ("genirq: Prevent use-after-free and work list corruption") Signed-off-by: Edward Cree Signed-off-by: Thomas Gleixner Acked-by: Ben Hutchings Link: https://lkml.kernel.org/r/24f5983f-2ab5-e83a-44ee-a45b5f9300f5@solarflare.com --- kernel/irq/manage.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7eee98c38f25..fe40c658f86f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } -- cgit v1.2.3 From 763802b53a427ed3cbd419dbba255c414fdd9e7c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Sat, 21 Mar 2020 18:22:41 -0700 Subject: x86/mm: split vmalloc_sync_all() Commit 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") introduced a call to vmalloc_sync_all() in the vunmap() code-path. While this change was necessary to maintain correctness on x86-32-pae kernels, it also adds additional cycles for architectures that don't need it. Specifically on x86-64 with CONFIG_VMAP_STACK=y some people reported severe performance regressions in micro-benchmarks because it now also calls the x86-64 implementation of vmalloc_sync_all() on vunmap(). But the vmalloc_sync_all() implementation on x86-64 is only needed for newly created mappings. To avoid the unnecessary work on x86-64 and to gain the performance back, split up vmalloc_sync_all() into two functions: * vmalloc_sync_mappings(), and * vmalloc_sync_unmappings() Most call-sites to vmalloc_sync_all() only care about new mappings being synchronized. The only exception is the new call-site added in the above mentioned commit. Shile Zhang directed us to a report of an 80% regression in reaim throughput. Fixes: 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") Reported-by: kernel test robot Reported-by: Shile Zhang Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Borislav Petkov Acked-by: Rafael J. Wysocki [GHES] Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Link: http://lkml.kernel.org/r/20191009124418.8286-1-joro@8bytes.org Link: https://lists.01.org/hyperkitty/list/lkp@lists.01.org/thread/4D3JPPHBNOSPFK2KEPC6KGKS6J25AIDB/ Link: http://lkml.kernel.org/r/20191113095530.228959-1-shile.zhang@linux.alibaba.com Signed-off-by: Linus Torvalds --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); -- cgit v1.2.3 From 8380ce479010f2f779587b462a9b4681934297c3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 28 Mar 2020 19:17:25 -0700 Subject: mm: fork: fix kernel_stack memcg stats for various stack implementations Depending on CONFIG_VMAP_STACK and the THREAD_SIZE / PAGE_SIZE ratio the space for task stacks can be allocated using __vmalloc_node_range(), alloc_pages_node() and kmem_cache_alloc_node(). In the first and the second cases page->mem_cgroup pointer is set, but in the third it's not: memcg membership of a slab page should be determined using the memcg_from_slab_page() function, which looks at page->slab_cache->memcg_params.memcg . In this case, using mod_memcg_page_state() (as in account_kernel_stack()) is incorrect: page->mem_cgroup pointer is NULL even for pages charged to a non-root memory cgroup. It can lead to kernel_stack per-memcg counters permanently showing 0 on some architectures (depending on the configuration). In order to fix it, let's introduce a mod_memcg_obj_state() helper, which takes a pointer to a kernel object as a first argument, uses mem_cgroup_from_obj() to get a RCU-protected memcg pointer and calls mod_memcg_state(). It allows to handle all possible configurations (CONFIG_VMAP_STACK and various THREAD_SIZE/PAGE_SIZE values) without spilling any memcg/kmem specifics into fork.c . Note: This is a special version of the patch created for stable backports. It contains code from the following two patches: - mm: memcg/slab: introduce mem_cgroup_from_obj() - mm: fork: fix kernel_stack memcg stats for various stack implementations [guro@fb.com: introduce mem_cgroup_from_obj()] Link: http://lkml.kernel.org/r/20200324004221.GA36662@carbon.dhcp.thefacebook.com Fixes: 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Bharata B Rao Cc: Shakeel Butt Cc: Link: http://lkml.kernel.org/r/20200303233550.251375-1-guro@fb.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 86425305cd4a..d90af13431c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } -- cgit v1.2.3