diff options
Diffstat (limited to 'kernel')
33 files changed, 351 insertions, 122 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83b416af4da1..c281f5b8705e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2121,6 +2121,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; + struct bpf_prog_aux *aux = fp->aux; if (fp->kprobe_override) return false; @@ -2132,12 +2133,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map, */ map->owner.type = prog_type; map->owner.jited = fp->jited; - map->owner.xdp_has_frags = fp->aux->xdp_has_frags; + map->owner.xdp_has_frags = aux->xdp_has_frags; + map->owner.attach_func_proto = aux->attach_func_proto; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && - map->owner.xdp_has_frags == fp->aux->xdp_has_frags; + map->owner.xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->owner.attach_func_proto != aux->attach_func_proto) { + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_STRUCT_OPS: + ret = false; + break; + default: + break; + } + } } spin_unlock(&map->owner.lock); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index dae9ed02a75b..06bc7f26be06 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2172,7 +2172,7 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_f b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7a4004f09bae..b145f3ef3695 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -813,7 +813,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || map_value_has_timer(map) || map_value_has_kptrs(map)) @@ -838,7 +838,12 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; @@ -849,13 +854,11 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags &= ~VM_MAYWRITE; err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } @@ -4002,6 +4005,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4148,8 +4153,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6e54f0daebef..7997c8021b62 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,7 +90,7 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6ea80ae42622..24d96a2fe062 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -69,8 +69,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fc2bc5646ee..552bb00bfceb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1180,6 +1180,12 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) list_del_init(&ctx->active_ctx_list); } +static inline void perf_pmu_read(struct perf_event *event) +{ + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); +} + static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); @@ -3389,8 +3395,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -4444,15 +4449,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -7101,9 +7099,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) @@ -7116,9 +7113,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) @@ -7173,6 +7169,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c3797701339c..382a3b04f6d3 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -978,9 +978,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e1655374c2e..4c4894de6e5d 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -181,6 +181,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { diff --git a/kernel/fork.c b/kernel/fork.c index 09a935724bd9..8cc313d27188 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1943,6 +1943,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2293,21 +2378,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7fb9731bc5f5..5f8ce961cd9a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6021,6 +6021,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 26ea5d04f56c..cf0da0f844f9 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -131,6 +131,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" choice prompt "Which hash algorithm should modules be signed with?" depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default MODULE_SIG_SHA512 help This determines which sort of hashing algorithm will be used during signature generation. This algorithm _must_ be built into the kernel diff --git a/kernel/padata.c b/kernel/padata.c index e2bef90e6fd0..b3f79f23060c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -350,7 +350,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } diff --git a/kernel/params.c b/kernel/params.c index 5b92310425c5..27954dfe3d20 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -945,7 +945,9 @@ int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } struct kobj_type module_ktype = { diff --git a/kernel/pid.c b/kernel/pid.c index 74834c04a081..8bce3aebc949 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) */ int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; - - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) - return -EINVAL; + int pidfd; + struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - flags | O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 044026abfdd7..3929ef8148c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -821,8 +821,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -963,13 +972,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 542c0e82a900..4dcb48973318 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -84,7 +84,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->limits_changed)) { sg_policy->limits_changed = false; - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + sg_policy->need_freq_update = true; return true; } @@ -96,10 +96,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; diff --git a/kernel/softirq.c b/kernel/softirq.c index 6665f5cd60cb..9ab5ca399a99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -140,6 +140,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -162,6 +174,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -225,6 +239,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -275,6 +291,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e60863ab74b5..b3860ec12450 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -177,27 +188,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will we issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -248,8 +286,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -258,8 +296,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -718,8 +755,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1216,10 +1252,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2d6cf93ca370..fc08d4ccdeeb 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -161,6 +161,7 @@ static int posix_timer_add(struct k_itimer *timer) return id; } spin_unlock(&hash_lock); + cond_resched(); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ed7d6ad694fb..20a5e6962b69 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6b5bb0e380b5..7254c808b27c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -416,10 +416,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -462,7 +463,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ce8a8eae7e83..bd19c83c0e5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6522,6 +6522,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); out: mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acc176aa1cbe..9e0b9c9a7dff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3422,10 +3422,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; @@ -3478,7 +3477,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3505,7 +3503,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3550,7 +3547,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3566,7 +3562,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -7054,13 +7049,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index aad7fcd84617..49b297ca7fc7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -780,13 +780,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..c9b0533407ed 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -125,6 +125,20 @@ out: return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 94bb5f9251b1..8723ad2f9c63 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -456,6 +456,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -484,6 +485,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -776,7 +783,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 86a0531efd43..23fbd1b10712 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -716,7 +716,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -735,7 +735,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f941ce01ee35..afdbad16d00a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1539,7 +1539,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 69e92c7359fb..44ae51d1dc45 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -561,11 +561,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 72655d81b37d..cc155c411768 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -975,7 +975,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ba48b5e270e1..3888a59c9dfe 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -143,9 +143,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -154,11 +157,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -167,6 +174,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a6a3ff2a441e..53ef3cb65098 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -732,7 +732,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } |