summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c19
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/syscall.c21
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/dma/contiguous.c3
-rw-r--r--kernel/events/core.c33
-rw-r--r--kernel/events/hw_breakpoint.c5
-rw-r--r--kernel/events/ring_buffer.c1
-rw-r--r--kernel/fork.c98
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/module/Kconfig1
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/pid.c19
-rw-r--r--kernel/rcu/tree_plugin.h22
-rw-r--r--kernel/sched/cpufreq_schedutil.c18
-rw-r--r--kernel/softirq.c18
-rw-r--r--kernel/time/hrtimer.c103
-rw-r--r--kernel/time/posix-timers.c1
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/trace/bpf_trace.c7
-rw-r--r--kernel/trace/ftrace.c1
-rw-r--r--kernel/trace/trace.c16
-rw-r--r--kernel/trace/trace.h16
-rw-r--r--kernel/trace/trace_dynevent.c16
-rw-r--r--kernel/trace/trace_dynevent.h1
-rw-r--r--kernel/trace/trace_events.c11
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_events_trigger.c2
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_probe.c9
-rw-r--r--kernel/trace/trace_uprobe.c2
33 files changed, 351 insertions, 122 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 83b416af4da1..c281f5b8705e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2121,6 +2121,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
+ struct bpf_prog_aux *aux = fp->aux;
if (fp->kprobe_override)
return false;
@@ -2132,12 +2133,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
*/
map->owner.type = prog_type;
map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags = aux->xdp_has_frags;
+ map->owner.attach_func_proto = aux->attach_func_proto;
ret = true;
} else {
ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->owner.attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
}
spin_unlock(&map->owner.lock);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index dae9ed02a75b..06bc7f26be06 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2172,7 +2172,7 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_f
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
- hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key;
if (is_percpu) {
/* current cpu value for percpu map */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7a4004f09bae..b145f3ef3695 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -813,7 +813,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
map_value_has_timer(map) || map_value_has_kptrs(map))
@@ -838,7 +838,12 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
@@ -849,13 +854,11 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_flags &= ~VM_MAYWRITE;
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -4002,6 +4005,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -4148,8 +4153,6 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
}
- if (prog->aux->btf)
- info.btf_id = btf_obj_id(prog->aux->btf);
info.attach_btf_id = prog->aux->attach_btf_id;
if (attach_btf)
info.attach_btf_obj_id = btf_obj_id(attach_btf);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6e54f0daebef..7997c8021b62 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -90,7 +90,7 @@
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 6ea80ae42622..24d96a2fe062 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -69,8 +69,7 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes __initconst =
- (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
static phys_addr_t size_cmdline __initdata = -1;
static phys_addr_t base_cmdline __initdata;
static phys_addr_t limit_cmdline __initdata;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8fc2bc5646ee..552bb00bfceb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1180,6 +1180,12 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
list_del_init(&ctx->active_ctx_list);
}
+static inline void perf_pmu_read(struct perf_event *event)
+{
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+}
+
static void get_ctx(struct perf_event_context *ctx)
{
refcount_inc(&ctx->refcount);
@@ -3389,8 +3395,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -4444,15 +4449,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -7101,9 +7099,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
@@ -7116,9 +7113,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -7173,6 +7169,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c3797701339c..382a3b04f6d3 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -978,9 +978,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 3e1655374c2e..4c4894de6e5d 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -181,6 +181,7 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 09a935724bd9..8cc313d27188 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1943,6 +1943,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2293,21 +2378,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7fb9731bc5f5..5f8ce961cd9a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6021,6 +6021,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
hlist_del_rcu(&class->hash_entry);
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
if (class - lock_classes == max_lock_class_idx)
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 26ea5d04f56c..cf0da0f844f9 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -131,6 +131,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
choice
prompt "Which hash algorithm should modules be signed with?"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
help
This determines which sort of hashing algorithm will be used during
signature generation. This algorithm _must_ be built into the kernel
diff --git a/kernel/padata.c b/kernel/padata.c
index e2bef90e6fd0..b3f79f23060c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -350,7 +350,8 @@ static void padata_reorder(struct parallel_data *pd)
* To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
*/
padata_get_pd(pd);
- queue_work(pinst->serial_wq, &pd->reorder_work);
+ if (!queue_work(pinst->serial_wq, &pd->reorder_work))
+ padata_put_pd(pd);
}
}
diff --git a/kernel/params.c b/kernel/params.c
index 5b92310425c5..27954dfe3d20 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -945,7 +945,9 @@ int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
struct kobj_type module_ktype = {
diff --git a/kernel/pid.c b/kernel/pid.c
index 74834c04a081..8bce3aebc949 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
-
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
+ int pidfd;
+ struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 044026abfdd7..3929ef8148c1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -821,8 +821,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -963,13 +972,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 542c0e82a900..4dcb48973318 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -84,7 +84,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (unlikely(sg_policy->limits_changed)) {
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ sg_policy->need_freq_update = true;
return true;
}
@@ -96,10 +96,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->need_freq_update)
+ if (sg_policy->need_freq_update) {
sg_policy->need_freq_update = false;
- else if (sg_policy->next_freq == next_freq)
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
return false;
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 6665f5cd60cb..9ab5ca399a99 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -140,6 +140,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
.lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key bh_lock_key;
+struct lockdep_map bh_lock_map = {
+ .name = "local_bh",
+ .key = &bh_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
+ .lock_type = LD_LOCK_PERCPU,
+};
+EXPORT_SYMBOL_GPL(bh_lock_map);
+#endif
+
/**
* local_bh_blocked() - Check for idle whether BH processing is blocked
*
@@ -162,6 +174,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
+ lock_map_acquire_read(&bh_lock_map);
+
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
@@ -225,6 +239,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
+ lock_map_release(&bh_lock_map);
+
local_irq_save(flags);
curcnt = __this_cpu_read(softirq_ctrl.cnt);
@@ -275,6 +291,8 @@ static inline void ksoftirqd_run_begin(void)
/* Counterpart to ksoftirqd_run_begin() */
static inline void ksoftirqd_run_end(void)
{
+ /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */
+ lock_map_release(&bh_lock_map);
__local_bh_enable(SOFTIRQ_OFFSET, true);
WARN_ON_ONCE(in_interrupt());
local_irq_enable();
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e60863ab74b5..b3860ec12450 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -177,27 +188,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will we issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -248,8 +286,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -258,8 +296,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -718,8 +755,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1216,10 +1252,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on a unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2d6cf93ca370..fc08d4ccdeeb 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -161,6 +161,7 @@ static int posix_timer_add(struct k_itimer *timer)
return id;
}
spin_unlock(&hash_lock);
+ cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
return -EAGAIN;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ed7d6ad694fb..20a5e6962b69 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,7 +46,7 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -98,7 +98,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6b5bb0e380b5..7254c808b27c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -416,10 +416,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -462,7 +463,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ce8a8eae7e83..bd19c83c0e5e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6522,6 +6522,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
}
+ cond_resched();
} while_for_each_ftrace_rec();
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index acc176aa1cbe..9e0b9c9a7dff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3422,10 +3422,9 @@ out_nobuffer:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-__printf(3, 0)
-static int
-__trace_array_vprintk(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, va_list args)
+static __printf(3, 0)
+int __trace_array_vprintk(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
@@ -3478,7 +3477,6 @@ out_nobuffer:
return len;
}
-__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
@@ -3505,7 +3503,6 @@ int trace_array_vprintk(struct trace_array *tr,
* Note, trace_array_init_printk() must be called on @tr before this
* can be used.
*/
-__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
{
@@ -3550,7 +3547,6 @@ int trace_array_init_printk(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(trace_array_init_printk);
-__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
@@ -3566,7 +3562,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
return ret;
}
-__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -7054,13 +7049,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index aad7fcd84617..49b297ca7fc7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -780,13 +780,15 @@ static inline void __init disable_tracing_selftest(const char *reason)
extern void *head_page(struct trace_array_cpu *data);
extern unsigned long long ns2usecs(u64 nsec);
-extern int
-trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_vprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args);
+
+__printf(2, 0)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(2, 0)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(3, 0)
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8a..c9b0533407ed 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -125,6 +125,20 @@ out:
return ret;
}
+/*
+ * Locked version of event creation. The event creation must be protected by
+ * dyn_event_ops_mutex because of protecting trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+ int ret;
+
+ mutex_lock(&dyn_event_ops_mutex);
+ ret = type->create(raw_command);
+ mutex_unlock(&dyn_event_ops_mutex);
+ return ret;
+}
+
static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 94bb5f9251b1..8723ad2f9c63 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -456,6 +456,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -484,6 +485,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
@@ -776,7 +783,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
+
+ WARN_ON_ONCE(ret);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 86a0531efd43..23fbd1b10712 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -716,7 +716,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -735,7 +735,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f941ce01ee35..afdbad16d00a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1539,7 +1539,7 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_event_file *file = data->private_data;
if (file)
- __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
else
trace_dump_stack(STACK_SKIP);
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 69e92c7359fb..44ae51d1dc45 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -561,11 +561,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+ __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 72655d81b37d..cc155c411768 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -975,7 +975,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_kprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ba48b5e270e1..3888a59c9dfe 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -143,9 +143,12 @@ fail:
}
static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;
void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.subsystem = subsystem;
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
@@ -154,11 +157,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
void trace_probe_log_clear(void)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
memset(&trace_probe_log, 0, sizeof(trace_probe_log));
}
void trace_probe_log_set_index(int index)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.index = index;
}
@@ -167,6 +174,8 @@ void __trace_probe_log_err(int offset, int err_type)
char *command, *p;
int i, len = 0, pos = 0;
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
if (!trace_probe_log.argv)
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a6a3ff2a441e..53ef3cb65098 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -732,7 +732,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_uprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}