diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bounds.c | 4 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 90 | ||||
-rw-r--r-- | kernel/cgroup.c | 6 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_io.c | 15 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_private.h | 2 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_support.c | 14 | ||||
-rw-r--r-- | kernel/events/core.c | 7 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 12 | ||||
-rw-r--r-- | kernel/futex.c | 12 | ||||
-rw-r--r-- | kernel/irq/manage.c | 8 | ||||
-rw-r--r-- | kernel/kprobes.c | 27 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 4 | ||||
-rw-r--r-- | kernel/module.c | 6 | ||||
-rw-r--r-- | kernel/printk/printk.c | 7 | ||||
-rw-r--r-- | kernel/sched/core.c | 4 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 2 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 75 | ||||
-rw-r--r-- | kernel/sched/fair.c | 24 | ||||
-rw-r--r-- | kernel/sched/sched.h | 14 | ||||
-rw-r--r-- | kernel/signal.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 18 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 3 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 2 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 29 | ||||
-rw-r--r-- | kernel/time/time.c | 10 | ||||
-rw-r--r-- | kernel/time/timeconst.bc | 6 |
26 files changed, 285 insertions, 118 deletions
diff --git a/kernel/bounds.c b/kernel/bounds.c index e1d1d1952bfa..c37f68d758db 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ #include <linux/log2.h> #include <linux/spinlock_types.h> -void foo(void) +int main(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); @@ -22,4 +22,6 @@ void foo(void) #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ + + return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dafa2708ce9e..1438b7396cb4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -540,10 +540,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, - int size, int value_regno) +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, + int size, int value_regno, int insn_idx) { - int i; + int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ @@ -558,15 +559,37 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, } /* save register state */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - state->regs[value_regno]; - - for (i = 0; i < BPF_REG_SIZE; i++) + state->spilled_regs[spi] = state->regs[value_regno]; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC && + !env->allow_ptr_leaks) { + int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; + int soff = (-spi - 1) * BPF_REG_SIZE; + + /* detected reuse of integer stack slot with a pointer + * which means either llvm is reusing stack slot or + * an attacker is trying to exploit CVE-2018-3639 + * (speculative store bypass) + * Have to sanitize that slot with preemptive + * store of zero. + */ + if (*poff && *poff != soff) { + /* disallow programs where single insn stores + * into two different stack slots, since verifier + * cannot sanitize them + */ + verbose("insn %d cannot access two stack slots fp%d and fp%d", + insn_idx, *poff, soff); + return -EINVAL; + } + *poff = soff; + } state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } } else { /* regular write of data into stack */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct bpf_reg_state) {}; + state->spilled_regs[spi] = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -747,7 +770,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { @@ -843,7 +866,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno, insn_idx); } else { err = check_stack_read(state, off, size, value_regno); } @@ -877,7 +901,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; int err; @@ -910,13 +934,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether atomic_add can read the memory */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); if (err) return err; /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn->dst_reg, insn->off, + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); } @@ -1272,7 +1296,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); if (err) return err; } @@ -2938,7 +2962,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn->src_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) @@ -2978,7 +3002,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn); + err = check_xadd(env, insn_idx, insn); if (err) return err; insn_idx++; @@ -2997,7 +3021,7 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) @@ -3032,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) @@ -3369,6 +3393,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else continue; + if (type == BPF_WRITE && + env->insn_aux_data[i + delta].sanitize_stack_off) { + struct bpf_insn patch[] = { + /* Sanitize suspicious stack slot with zero. + * There are no memory dependencies for this store, + * since it's only using frame pointer and immediate + * constant of zero + */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, + env->insn_aux_data[i + delta].sanitize_stack_off, + 0), + /* the original STX instruction will immediately + * overwrite the same stack slot with appropriate value + */ + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c233437ee1a..bb0cf1caf1cd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4386,7 +4386,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 77777d918676..cc892a9e109d 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -215,7 +215,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int count; int i; int diag, dtab_count; - int key; + int key, buf_size, ret; diag = kdbgetintenv("DTABCOUNT", &dtab_count); @@ -335,9 +335,8 @@ poll_again: else p_tmp = tmpbuffer; len = strlen(p_tmp); - count = kallsyms_symbol_complete(p_tmp, - sizeof(tmpbuffer) - - (p_tmp - tmpbuffer)); + buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer); + count = kallsyms_symbol_complete(p_tmp, buf_size); if (tab == 2 && count > 0) { kdb_printf("\n%d symbols are found.", count); if (count > dtab_count) { @@ -349,9 +348,13 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) + ret = kallsyms_symbol_next(p_tmp, i, buf_size); + if (WARN_ON(!ret)) break; - kdb_printf("%s ", p_tmp); + if (ret != -E2BIG) + kdb_printf("%s ", p_tmp); + else + kdb_printf("%s... ", p_tmp); *(p_tmp + len) = '\0'; } if (i >= dtab_count) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..533e04e75a9c 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -83,7 +83,7 @@ typedef struct __ksymtab { unsigned long sym_start; unsigned long sym_end; } kdb_symtab_t; -extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index d35cc2d3a4cc..61cd704a21c8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) } if (i >= ARRAY_SIZE(kdb_name_table)) { debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, + memmove(kdb_name_table, kdb_name_table+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-1)); } else { debug_kfree(knt1); knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, + memmove(kdb_name_table+i, kdb_name_table+i+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-i-1)); } @@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) * Parameters: * prefix_name prefix of a symbol name to lookup * flag 0 means search from the head, 1 means continue search. + * buf_size maximum length that can be written to prefix_name + * buffer * Returns: * 1 if a symbol matches the given prefix. * 0 if no string found */ -int kallsyms_symbol_next(char *prefix_name, int flag) +int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size) { int prefix_len = strlen(prefix_name); static loff_t pos; @@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag) pos = 0; while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strncpy(prefix_name, name, strlen(name)+1); - return 1; - } + if (!strncmp(name, prefix_name, prefix_len)) + return strscpy(prefix_name, name, buf_size); } return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 95bd00d9f2c3..1af0bbf20984 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4331,7 +4331,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4361,12 +4363,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -7737,6 +7742,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a1de021dccba..fbfab5722254 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -608,7 +608,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: @@ -1902,10 +1902,18 @@ static void handle_swbp(struct pt_regs *regs) * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ - smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* + * Pairs with the smp_wmb() in prepare_uprobe(). + * + * Guarantees that if we see the UPROBE_COPY_INSN bit set, then + * we must also see the stores to &uprobe->arch performed by the + * prepare_uprobe() call. + */ + smp_rmb(); + /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; diff --git a/kernel/futex.c b/kernel/futex.c index c3ea6f2a6997..053d7be08be5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1467,8 +1467,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { - if (oparg < 0 || oparg > 31) - return -EINVAL; + if (oparg < 0 || oparg > 31) { + char comm[sizeof(current->comm)]; + /* + * kill this print and return -EINVAL when userspace + * is sane again + */ + pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", + get_task_comm(comm, current), oparg); + oparg &= 31; + } oparg = 1 << oparg; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e121645bb8a1..cf94460504bb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -878,6 +878,9 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; @@ -894,6 +897,9 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); return ret; } @@ -971,8 +977,6 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (action_ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b9e966bcdd20..f580352cc6e5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -665,9 +665,10 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) } /* Cancel unoptimizing for reusing */ -static void reuse_unused_kprobe(struct kprobe *ap) +static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; + int ret; BUG_ON(!kprobe_unused(ap)); /* @@ -681,8 +682,12 @@ static void reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - BUG_ON(!kprobe_optready(ap)); + ret = kprobe_optready(ap); + if (ret) + return ret; + optimize_kprobe(ap); + return 0; } /* Remove optimized instructions */ @@ -894,11 +899,16 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) #define kprobe_disarmed(p) kprobe_disabled(p) #define wait_for_kprobe_optimizer() do {} while (0) -/* There should be no unused kprobes can be reused without optimization */ -static void reuse_unused_kprobe(struct kprobe *ap) +static int reuse_unused_kprobe(struct kprobe *ap) { + /* + * If the optimized kprobe is NOT supported, the aggr kprobe is + * released at the same time that the last aggregated kprobe is + * unregistered. + * Thus there should be no chance to reuse unused kprobe. + */ printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); - BUG_ON(kprobe_unused(ap)); + return -EINVAL; } static void free_aggr_kprobe(struct kprobe *p) @@ -1276,9 +1286,12 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) goto out; } init_aggr_kprobe(ap, orig_p); - } else if (kprobe_unused(ap)) + } else if (kprobe_unused(ap)) { /* This probe is going to die. Rescue it */ - reuse_unused_kprobe(ap); + ret = reuse_unused_kprobe(ap); + if (ret) + goto out; + } if (kprobe_gone(ap)) { /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 61a15e538435..26fc428476b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4010,7 +4010,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) + if (unlikely(!lock_stat || !debug_locks)) return; if (unlikely(current->lockdep_recursion)) @@ -4030,7 +4030,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) + if (unlikely(!lock_stat || !debug_locks)) return; if (unlikely(current->lockdep_recursion)) diff --git a/kernel/module.c b/kernel/module.c index 0651f2d25fc9..2325c9821f2a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4011,7 +4011,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) for (i = 0; i < kallsyms->num_symtab; i++) if (strcmp(name, symname(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_info != 'U') + kallsyms->symtab[i].st_shndx != SHN_UNDEF) return kallsyms->symtab[i].st_value; return 0; } @@ -4057,6 +4057,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + continue; + ret = fn(data, symname(kallsyms, i), mod, kallsyms->symtab[i].st_value); if (ret != 0) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9ae22e845029..09f9842f9f42 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1010,7 +1010,12 @@ static void __init log_buf_len_update(unsigned size) /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned size = memparse(str, &str); + unsigned int size; + + if (!str) + return -EINVAL; + + size = memparse(str, &str); log_buf_len_update(size); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 917be221438b..6b3fff6a6437 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4087,8 +4087,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* double check policy once rq lock held */ if (policy < 0) { diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..4c882791c10c 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + nsec_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..448d6426fa5f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -37,6 +37,18 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + enum cpu_usage_stat idx) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + u64_stats_update_begin(&irqtime->sync); + cpustat[idx] += delta; + irqtime->total += delta; + irqtime->tick_delta += delta; + u64_stats_update_end(&irqtime->sync); +} + /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. @@ -54,7 +66,6 @@ void irqtime_account_irq(struct task_struct *curr) delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; - u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -62,48 +73,29 @@ void irqtime_account_irq(struct task_struct *curr) * that do not consume any time, but still wants to run. */ if (hardirq_count()) - irqtime->hardirq_time += delta; + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - irqtime->softirq_time += delta; - - u64_stats_update_end(&irqtime->sync); + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) +static cputime_t irqtime_tick_accounted(cputime_t maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t irq_cputime; - - irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; - irq_cputime = min(irq_cputime, maxtime); - cpustat[idx] += irq_cputime; - - return irq_cputime; -} + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + cputime_t delta; -static cputime_t irqtime_account_hi_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), - CPUTIME_IRQ, maxtime); -} + delta = nsecs_to_cputime(irqtime->tick_delta); + delta = min(delta, maxtime); + irqtime->tick_delta -= cputime_to_nsecs(delta); -static cputime_t irqtime_account_si_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), - CPUTIME_SOFTIRQ, maxtime); + return delta; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) -static cputime_t irqtime_account_hi_update(cputime_t dummy) -{ - return 0; -} - -static cputime_t irqtime_account_si_update(cputime_t dummy) +static cputime_t irqtime_tick_accounted(cputime_t dummy) { return 0; } @@ -143,7 +135,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime_to_nsecs(cputime)); /* Account for user time used */ acct_account_cputime(p); @@ -168,11 +160,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + cpustat[CPUTIME_NICE] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_GUEST_NICE] += cputime_to_nsecs(cputime); } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; + cpustat[CPUTIME_USER] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_GUEST] += cputime_to_nsecs(cputime); } } @@ -193,7 +185,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, account_group_system_time(p, cputime); /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime_to_nsecs(cputime)); /* Account for system time used */ acct_account_cputime(p); @@ -234,7 +226,7 @@ void account_steal_time(cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat[CPUTIME_STEAL] += (__force u64) cputime; + cpustat[CPUTIME_STEAL] += cputime_to_nsecs(cputime); } /* @@ -247,9 +239,9 @@ void account_idle_time(cputime_t cputime) struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + cpustat[CPUTIME_IOWAIT] += cputime_to_nsecs(cputime); else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; + cpustat[CPUTIME_IDLE] += cputime_to_nsecs(cputime); } /* @@ -290,10 +282,7 @@ static inline cputime_t account_other_time(cputime_t max) accounted = steal_account_process_time(max); if (accounted < max) - accounted += irqtime_account_hi_update(max - accounted); - - if (accounted < max) - accounted += irqtime_account_si_update(max - accounted); + accounted += irqtime_tick_accounted(max - accounted); return accounted; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f81adb476c03..0c91d72f3e8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3502,7 +3502,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- ie. we'll be penalized. */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); } @@ -3976,9 +3976,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) /* * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us + * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is + * not running add to the tail so that later runqueues don't get starved. */ - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_b->distribute_running) + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + else + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); /* * If we're the first throttled task, make sure the bandwidth @@ -4121,14 +4125,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * in us over-using our runtime if it is all used during this loop, but * only by limited amounts in that extreme case. */ - while (throttled && cfs_b->runtime > 0) { + while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; + cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); raw_spin_lock(&cfs_b->lock); + cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4239,6 +4245,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); + if (cfs_b->distribute_running) { + raw_spin_unlock(&cfs_b->lock); + return; + } + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock(&cfs_b->lock); return; @@ -4248,6 +4259,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) runtime = cfs_b->runtime; expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); if (!runtime) @@ -4258,6 +4272,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4366,6 +4381,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; + cfs_b->distribute_running = 0; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f564a1d2c9d5..ec6e838e991a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4,6 +4,7 @@ #include <linux/sched/rt.h> #include <linux/u64_stats_sync.h> #include <linux/sched/deadline.h> +#include <linux/kernel_stat.h> #include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -254,6 +255,8 @@ struct cfs_bandwidth { /* statistics */ int nr_periods, nr_throttled; u64 throttled_time; + + bool distribute_running; #endif }; @@ -1742,14 +1745,19 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { - u64 hardirq_time; - u64 softirq_time; + u64 total; + u64 tick_delta; u64 irq_start_time; struct u64_stats_sync sync; }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ static inline u64 irq_time_read(int cpu) { struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); @@ -1758,7 +1766,7 @@ static inline u64 irq_time_read(int cpu) do { seq = __u64_stats_fetch_begin(&irqtime->sync); - total = irqtime->softirq_time + irqtime->hardirq_time; + total = irqtime->total; } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); return total; diff --git a/kernel/signal.c b/kernel/signal.c index 4364e57e6038..424306163edc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -991,7 +991,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_FORCED))) + from_ancestor_ns || (info == SEND_SIG_PRIV) || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7df6be31be36..23f658d311c0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1795,6 +1795,24 @@ static struct ctl_table fs_table[] = { .extra2 = &one, }, { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index d67ef56ca9bc..a0ee81f49a87 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -786,7 +786,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); - exp = ktime_add(now, exp); + + exp = ktime_add_safe(now, exp); } if (alarmtimer_do_nsleep(&alarm, exp)) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..21a27bb73587 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -103,7 +103,7 @@ static void bump_cpu_timer(struct k_itimer *timer, continue; timer->it.cpu.expires += incr; - timer->it_overrun += 1 << i; + timer->it_overrun += 1LL << i; delta -= incr; } } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fc7c37ad90a0..0e6ed2e7d066 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -355,6 +355,17 @@ static __init int init_posix_timers(void) __initcall(init_posix_timers); +/* + * The siginfo si_overrun field and the return value of timer_getoverrun(2) + * are of type int. Clamp the overrun value to INT_MAX + */ +static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +{ + s64 sum = timr->it_overrun_last + (s64)baseval; + + return sum > (s64)INT_MAX ? INT_MAX : (int)sum; +} + static void schedule_next_timer(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; @@ -362,12 +373,11 @@ static void schedule_next_timer(struct k_itimer *timr) if (timr->it.real.interval.tv64 == 0) return; - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it.real.interval); + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), + timr->it.real.interval); timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; + timr->it_overrun = -1LL; ++timr->it_requeue_pending; hrtimer_restart(timer); } @@ -396,7 +406,7 @@ void do_schedule_next_timer(struct siginfo *info) else schedule_next_timer(timr); - info->si_overrun += timr->it_overrun_last; + info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } if (timr) @@ -491,8 +501,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) now = ktime_add(now, kj); } #endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, + timr->it_overrun += hrtimer_forward(timer, now, timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -633,7 +642,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; - new_timer->it_overrun = -1; + new_timer->it_overrun = -1LL; if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { @@ -762,7 +771,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) */ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || timr->it_sigev_notify == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + timr->it_overrun += hrtimer_forward(timer, now, iv); remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ @@ -824,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) if (!timr) return -EINVAL; - overrun = timr->it_overrun_last; + overrun = timer_overrun_to_int(timr, 0); unlock_timer(timr, flags); return overrun; diff --git a/kernel/time/time.c b/kernel/time/time.c index 39468651a064..a5b6d98ea7b1 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -704,6 +704,16 @@ u64 nsec_to_clock_t(u64 x) #endif } +u64 jiffies64_to_nsecs(u64 j) +{ +#if !(NSEC_PER_SEC % HZ) + return (NSEC_PER_SEC / HZ) * j; +# else + return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_nsecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c48688904f9f..f83bbb81600b 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -98,6 +98,12 @@ define timeconst(hz) { print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + + cd=gcd(hz,1000000000) + print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" + print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" print "\n" print "#endif /* KERNEL_TIMECONST_H */\n" |