Diffstat (limited to 'kernel')
-rw-r--r-- kernel/audit_watch.c | 16
-rw-r--r-- kernel/bpf/hashtab.c | 2
-rw-r--r-- kernel/bpf/inode.c | 2
-rw-r--r-- kernel/bpf/preload/bpf_preload_kern.c | 1
-rw-r--r-- kernel/bpf/queue_stack_maps.c | 35
-rw-r--r-- kernel/bpf/ringbuf.c | 17
-rw-r--r-- kernel/bpf/rqspinlock.c | 2
-rw-r--r-- kernel/bpf/syscall.c | 6
-rw-r--r-- kernel/cgroup/cgroup.c | 37
-rw-r--r-- kernel/cgroup/cpuset-internal.h | 1
-rw-r--r-- kernel/cgroup/cpuset.c | 407
-rw-r--r-- kernel/cgroup/rstat.c | 3
-rw-r--r-- kernel/cpu.c | 5
-rw-r--r-- kernel/dma/coherent.c | 12
-rw-r--r-- kernel/dma/contiguous.c | 3
-rw-r--r-- kernel/dma/mapping.c | 25
-rw-r--r-- kernel/entry/common.c | 49
-rw-r--r-- kernel/events/core.c | 675
-rw-r--r-- kernel/events/ring_buffer.c | 29
-rw-r--r-- kernel/events/uprobes.c | 15
-rw-r--r-- kernel/exit.c | 12
-rw-r--r-- kernel/fork.c | 121
-rw-r--r-- kernel/futex/core.c | 803
-rw-r--r-- kernel/futex/futex.h | 74
-rw-r--r-- kernel/futex/pi.c | 308
-rw-r--r-- kernel/futex/requeue.c | 460
-rw-r--r-- kernel/futex/waitwake.c | 207
-rw-r--r-- kernel/irq/msi.c | 2
-rw-r--r-- kernel/livepatch/transition.c | 49
-rw-r--r-- kernel/locking/lockdep.c | 76
-rw-r--r-- kernel/locking/lockdep_internals.h | 1
-rw-r--r-- kernel/locking/lockdep_proc.c | 2
-rw-r--r-- kernel/locking/percpu-rwsem.c | 13
-rw-r--r-- kernel/module/Kconfig | 5
-rw-r--r-- kernel/module/main.c | 1
-rw-r--r-- kernel/nsproxy.c | 30
-rw-r--r-- kernel/padata.c | 3
-rw-r--r-- kernel/params.c | 51
-rw-r--r-- kernel/pid.c | 6
-rw-r--r-- kernel/power/hibernate.c | 16
-rw-r--r-- kernel/power/main.c | 31
-rw-r--r-- kernel/power/power.h | 4
-rw-r--r-- kernel/power/suspend.c | 7
-rw-r--r-- kernel/power/swap.c | 103
-rw-r--r-- kernel/rcu/rcu.h | 18
-rw-r--r-- kernel/rcu/rcuscale.c | 2
-rw-r--r-- kernel/rcu/rcutorture.c | 206
-rw-r--r-- kernel/rcu/srcutree.c | 2
-rw-r--r-- kernel/rcu/tree.c | 84
-rw-r--r-- kernel/rcu/tree.h | 3
-rw-r--r-- kernel/rcu/tree_exp.h | 2
-rw-r--r-- kernel/rcu/tree_nocb.h | 10
-rw-r--r-- kernel/rcu/tree_plugin.h | 2
-rw-r--r-- kernel/rcu/tree_stall.h | 4
-rw-r--r-- kernel/sched/core.c | 148
-rw-r--r-- kernel/sched/cpufreq_schedutil.c | 49
-rw-r--r-- kernel/sched/debug.c | 4
-rw-r--r-- kernel/sched/ext.c | 241
-rw-r--r-- kernel/sched/ext_idle.c | 2
-rw-r--r-- kernel/sched/fair.c | 37
-rw-r--r-- kernel/sched/isolation.c | 2
-rw-r--r-- kernel/sched/rt.c | 105
-rw-r--r-- kernel/sched/sched.h | 34
-rw-r--r-- kernel/sched/syscalls.c | 5
-rw-r--r-- kernel/sched/topology.c | 129
-rw-r--r-- kernel/sys.c | 4
-rw-r--r-- kernel/time/hrtimer.c | 2
-rw-r--r-- kernel/time/tick-common.c | 22
-rw-r--r-- kernel/time/timekeeping.c | 50
-rw-r--r-- kernel/time/vsyscall.c | 4
-rw-r--r-- kernel/trace/blktrace.c | 11
-rw-r--r-- kernel/trace/fprobe.c | 171
-rw-r--r-- kernel/trace/ftrace.c | 337
-rw-r--r-- kernel/trace/ring_buffer.c | 8
-rw-r--r-- kernel/trace/rv/rv.c | 7
-rw-r--r-- kernel/trace/trace.c | 16
-rw-r--r-- kernel/trace/trace_dynevent.c | 16
-rw-r--r-- kernel/trace/trace_dynevent.h | 1
-rw-r--r-- kernel/trace/trace_entries.h | 4
-rw-r--r-- kernel/trace/trace_eprobe.c | 3
-rw-r--r-- kernel/trace/trace_events_filter.c | 4
-rw-r--r-- kernel/trace/trace_events_synth.c | 1
-rw-r--r-- kernel/trace/trace_events_trigger.c | 2
-rw-r--r-- kernel/trace/trace_fprobe.c | 26
-rw-r--r-- kernel/trace/trace_functions.c | 6
-rw-r--r-- kernel/trace/trace_functions_graph.c | 11
-rw-r--r-- kernel/trace/trace_kprobe.c | 2
-rw-r--r-- kernel/trace/trace_output.c | 4
-rw-r--r-- kernel/trace/trace_probe.c | 9
-rw-r--r-- kernel/trace/trace_uprobe.c | 2
-rw-r--r-- kernel/vhost_task.c | 2
91 files changed, 3632 insertions, 1909 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 367eaf2c78b7..0ebbbe37a60f 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -347,12 +347,17 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct dentry *d = kern_path_locked(watch->path, parent);
+ struct dentry *d;
+
+ d = kern_path_locked_negative(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- /* update watch filter fields */
- watch->dev = d->d_sb->s_dev;
- watch->ino = d_backing_inode(d)->i_ino;
+
+ if (d_is_positive(d)) {
+ /* update watch filter fields */
+ watch->dev = d->d_sb->s_dev;
+ watch->ino = d_backing_inode(d)->i_ino;
+ }
inode_unlock(d_backing_inode(parent->dentry));
dput(d);
@@ -418,11 +423,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- if (ret && ret != -ENOENT) {
+ if (ret) {
audit_put_watch(watch);
return ret;
}
- ret = 0;
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5a5adc66b8e2..92b606d60020 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2189,7 +2189,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
- hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key;
if (is_percpu) {
/* current cpu value for percpu map */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dc3aa91a6ba0..5c2e96b19392 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
int ret;
inode_lock(parent->d_inode);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry)) {
inode_unlock(parent->d_inode);
return PTR_ERR(dentry);
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 2fdf3c978db1..774e5a538811 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -89,5 +89,6 @@ static void __exit fini(void)
}
late_initcall(load);
module_exit(fini);
+MODULE_IMPORT_NS("BPF_INTERNAL");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
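Note: the MODULE_IMPORT_NS("BPF_INTERNAL") added above pairs with the EXPORT_SYMBOL_NS(..., "BPF_INTERNAL") conversions in kernel/bpf/syscall.c further down in this diff; a module that does not declare the import can no longer link against those symbols. A minimal sketch of the pairing (the helper name is hypothetical, only the macros come from the hunks in this diff):

	/* exporting side (built-in or module) */
	EXPORT_SYMBOL_NS(demo_helper, "BPF_INTERNAL");

	/* importing module must opt in to the namespace */
	MODULE_IMPORT_NS("BPF_INTERNAL");
	MODULE_LICENSE("GPL");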
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index d869f51ea93a..9a5f94371e50 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -9,13 +9,14 @@
#include <linux/slab.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
+#include <asm/rqspinlock.h>
#define QUEUE_STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_queue_stack {
struct bpf_map map;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
u32 head, tail;
u32 size; /* max_entries + 1 */
@@ -78,7 +79,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
qs->size = size;
- raw_spin_lock_init(&qs->lock);
+ raw_res_spin_lock_init(&qs->lock);
return &qs->map;
}
@@ -98,12 +99,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -120,7 +117,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
}
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
@@ -133,12 +130,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -157,7 +150,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
qs->head = index;
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
@@ -203,12 +196,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
if (queue_stack_map_is_full(qs)) {
if (!replace) {
@@ -227,7 +216,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
qs->head = 0;
out:
- raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags);
return err;
}
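Note on the conversion above (the ring buffer hunks below follow the same pattern): raw_res_spin_lock_irqsave() can fail and return a nonzero error instead of spinning indefinitely, which is why the old in_nmi() trylock special case is dropped. A minimal sketch of the calling convention, assuming only the rqspinlock helpers visible in these hunks; the surrounding function and lock are hypothetical:

	#include <linux/errno.h>
	#include <asm/rqspinlock.h>

	static rqspinlock_t demo_lock;	/* hypothetical; set up with raw_res_spin_lock_init() */

	static int demo_critical_section(void)
	{
		unsigned long flags;

		/* Acquisition may fail; the caller must handle the error. */
		if (raw_res_spin_lock_irqsave(&demo_lock, flags))
			return -EBUSY;

		/* ... protected work ... */

		raw_res_spin_unlock_irqrestore(&demo_lock, flags);
		return 0;
	}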
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 1499d8caa9a3..719d73299397 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -11,6 +11,7 @@
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -29,7 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
- raw_spinlock_t spinlock ____cacheline_aligned_in_smp;
+ rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
* the spinlock that is used for kernel-producer ring buffers. This is
@@ -173,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
if (!rb)
return NULL;
- raw_spin_lock_init(&rb->spinlock);
+ raw_res_spin_lock_init(&rb->spinlock);
atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -416,12 +417,8 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
cons_pos = smp_load_acquire(&rb->consumer_pos);
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&rb->spinlock, flags))
- return NULL;
- } else {
- raw_spin_lock_irqsave(&rb->spinlock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
+ return NULL;
pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
@@ -446,7 +443,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
*/
if (new_prod_pos - cons_pos > rb->mask ||
new_prod_pos - pend_pos > rb->mask) {
- raw_spin_unlock_irqrestore(&rb->spinlock, flags);
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
@@ -458,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
/* pairs with consumer's smp_load_acquire() */
smp_store_release(&rb->producer_pos, new_prod_pos);
- raw_spin_unlock_irqrestore(&rb->spinlock, flags);
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index b896c4a75a5c..338305c8852c 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -253,7 +253,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
})
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask) \
- ({ (ret) = check_timeout(&(ts)); })
+ ({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif
/*
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9794446bc8c6..64c3393e8270 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1583,7 +1583,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
-EXPORT_SYMBOL(bpf_map_get);
+EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -3364,7 +3364,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
bpf_link_inc(link);
return link;
}
-EXPORT_SYMBOL(bpf_link_get_from_fd);
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL");
static void bpf_tracing_link_release(struct bpf_link *link)
{
@@ -6020,7 +6020,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
return ____bpf_sys_bpf(cmd, attr, size);
}
}
-EXPORT_SYMBOL(kern_sys_bpf);
+EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL");
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 27f08aa17b56..63e5b90da1f3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -90,7 +90,7 @@
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
@@ -2353,9 +2353,37 @@ static struct file_system_type cgroup2_fs_type = {
};
#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+ Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ return 0;
+ }
+ return -EINVAL;
+}
+
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
+ .parse_param = cpuset_parse_param,
};
/*
@@ -2392,6 +2420,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
+ .parameters = cpuset_fs_parameters,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
@@ -5923,6 +5952,12 @@ static void kill_css(struct cgroup_subsys_state *css)
if (css->flags & CSS_DYING)
return;
+ /*
+ * Call css_killed(), if defined, before setting the CSS_DYING flag
+ */
+ if (css->ss->css_killed)
+ css->ss->css_killed(css);
+
css->flags |= CSS_DYING;
/*
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 976a8bc3ff60..383963e28ac6 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -33,6 +33,7 @@ enum prs_errcode {
PERR_CPUSEMPTY,
PERR_HKEEPING,
PERR_ACCESS,
+ PERR_REMOTE,
};
/* bits in struct cpuset flags field */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 39c1fc643d77..24b70ea3e6ce 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -61,10 +61,17 @@ static const char * const perr_strings[] = {
[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
[PERR_ACCESS] = "Enable partition not permitted",
+ [PERR_REMOTE] = "Have remote partition underneath",
};
/*
- * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ * For local partitions, update to subpartitions_cpus & isolated_cpus is done
+ * in update_parent_effective_cpumask(). For remote partitions, it is done in
+ * the remote_partition_*() and remote_cpus_update() helpers.
+ */
+/*
+ * Exclusive CPUs distributed out to local or remote sub-partitions of
+ * top_cpuset
*/
static cpumask_var_t subpartitions_cpus;
@@ -86,7 +93,6 @@ static struct list_head remote_children;
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
* - update_partition_sd_lb()
- * - remote_partition_check()
* - update_cpumasks_hier()
* - cpuset_update_flag()
* - cpuset_hotplug_update_tasks()
@@ -1089,9 +1095,14 @@ void cpuset_reset_sched_domains(void)
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
- * is used instead of effective_cpus to make sure all offline CPUs are also
- * included as hotplug code won't update cpumasks for tasks in top_cpuset.
+ * cpuset membership stays stable.
+ *
+ * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus
+ * to make sure all offline CPUs are also included as hotplug code won't
+ * update cpumasks for tasks in top_cpuset.
+ *
+ * As task_cpu_possible_mask() can be task dependent in arm64, we have to
+ * do cpu masking per task instead of doing it once for all.
*/
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
@@ -1105,9 +1116,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
- * Percpu kthreads in top_cpuset are ignored
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
*/
- if (kthread_is_per_cpu(task))
+ if (task->flags & PF_NO_SETAFFINITY)
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
@@ -1151,7 +1164,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
*
* Return: 0 if successful, an error code otherwise
*/
-static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
{
bool exclusive = (new_prs > PRS_MEMBER);
@@ -1234,12 +1247,12 @@ static void reset_partition_data(struct cpuset *cs)
}
/*
- * partition_xcpus_newstate - Exclusive CPUs state change
+ * isolated_cpus_update - Update the isolated_cpus mask
* @old_prs: old partition_root_state
* @new_prs: new partition_root_state
* @xcpus: exclusive CPUs with state change
*/
-static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
if (new_prs == PRS_ISOLATED)
@@ -1273,8 +1286,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
isolcpus_updated = (new_prs != parent->partition_root_state);
if (isolcpus_updated)
- partition_xcpus_newstate(parent->partition_root_state, new_prs,
- xcpus);
+ isolated_cpus_update(parent->partition_root_state, new_prs,
+ xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
return isolcpus_updated;
@@ -1304,8 +1317,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
isolcpus_updated = (old_prs != parent->partition_root_state);
if (isolcpus_updated)
- partition_xcpus_newstate(old_prs, parent->partition_root_state,
- xcpus);
+ isolated_cpus_update(old_prs, parent->partition_root_state,
+ xcpus);
cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
@@ -1340,20 +1353,57 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
* compute_effective_exclusive_cpumask - compute effective exclusive CPUs
* @cs: cpuset
* @xcpus: effective exclusive CPUs value to be set
- * Return: true if xcpus is not empty, false otherwise.
+ * @real_cs: the real cpuset (can be NULL)
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
*
- * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of parent's effective_xcpus.
+ * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
+ * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
+ * as well. The provision of real_cs means that a cpumask is being changed and
+ * the given cs is a trial one.
*/
-static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus)
+static int compute_effective_exclusive_cpumask(struct cpuset *cs,
+ struct cpumask *xcpus,
+ struct cpuset *real_cs)
{
+ struct cgroup_subsys_state *css;
struct cpuset *parent = parent_cs(cs);
+ struct cpuset *sibling;
+ int retval = 0;
if (!xcpus)
xcpus = cs->effective_xcpus;
- return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
+ cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!real_cs) {
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+ } else {
+ cs = real_cs;
+ }
+
+ /*
+ * Exclude exclusive CPUs from siblings
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, css, parent) {
+ if (sibling == cs)
+ continue;
+
+ if (!cpumask_empty(sibling->exclusive_cpus) &&
+ cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ retval++;
+ continue;
+ }
+ if (!cpumask_empty(sibling->effective_xcpus) &&
+ cpumask_intersects(xcpus, sibling->effective_xcpus)) {
+ cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ retval++;
+ }
+ }
+ rcu_read_unlock();
+ return retval;
}
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1395,7 +1445,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* remote partition root underneath it, its exclusive_cpus must
* have overlapped with subpartitions_cpus.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
if (cpumask_empty(tmp->new_cpus) ||
cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1404,8 +1454,11 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
spin_lock_irq(&callback_lock);
isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
list_add(&cs->remote_sibling, &remote_children);
+ cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ cpuset_force_rebuild();
+ cs->prs_err = 0;
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1428,20 +1481,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
bool isolcpus_updated;
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
WARN_ON_ONCE(!is_remote_partition(cs));
- WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
list_del_init(&cs->remote_sibling);
isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
- NULL, tmp->new_cpus);
- cs->partition_root_state = -cs->partition_root_state;
- if (!cs->prs_err)
- cs->prs_err = PERR_INVCPUS;
+ NULL, cs->effective_xcpus);
+ if (cs->prs_err)
+ cs->partition_root_state = -cs->partition_root_state;
+ else
+ cs->partition_root_state = PRS_MEMBER;
+
+ /* effective_xcpus may need to be changed */
+ compute_effective_exclusive_cpumask(cs, NULL, NULL);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ cpuset_force_rebuild();
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1453,14 +1510,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
/*
* remote_cpus_update - cpus_exclusive change of remote partition
* @cs: the cpuset to be updated
- * @newmask: the new effective_xcpus mask
+ * @xcpus: the new exclusive_cpus mask, if non-NULL
+ * @excpus: the new effective_xcpus mask
* @tmp: temporary masks
*
* top_cpuset and subpartitions_cpus will be updated or partition can be
* invalidated.
*/
-static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
- struct tmpmasks *tmp)
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
+ struct cpumask *excpus, struct tmpmasks *tmp)
{
bool adding, deleting;
int prs = cs->partition_root_state;
@@ -1471,29 +1529,45 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
- if (cpumask_empty(newmask))
+ if (cpumask_empty(excpus)) {
+ cs->prs_err = PERR_CPUSEMPTY;
goto invalidate;
+ }
- adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
- deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+ adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
/*
* Additions of remote CPUs is only allowed if those CPUs are
* not allocated to other partitions and there are effective_cpus
* left in the top cpuset.
*/
- if (adding && (!capable(CAP_SYS_ADMIN) ||
- cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
- cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
- goto invalidate;
+ if (adding) {
+ if (!capable(CAP_SYS_ADMIN))
+ cs->prs_err = PERR_ACCESS;
+ else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
+ cs->prs_err = PERR_NOCPUS;
+ if (cs->prs_err)
+ goto invalidate;
+ }
spin_lock_irq(&callback_lock);
if (adding)
isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+ /*
+ * Need to update effective_xcpus and exclusive_cpus now as
+ * update_sibling_cpumasks() below may iterate back to the same cs.
+ */
+ cpumask_copy(cs->effective_xcpus, excpus);
+ if (xcpus)
+ cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ if (adding || deleting)
+ cpuset_force_rebuild();
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1507,47 +1581,6 @@ invalidate:
}
/*
- * remote_partition_check - check if a child remote partition needs update
- * @cs: the cpuset to be updated
- * @newmask: the new effective_xcpus mask
- * @delmask: temporary mask for deletion (not in tmp)
- * @tmp: temporary masks
- *
- * This should be called before the given cs has updated its cpus_allowed
- * and/or effective_xcpus.
- */
-static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
- struct cpumask *delmask, struct tmpmasks *tmp)
-{
- struct cpuset *child, *next;
- int disable_cnt = 0;
-
- /*
- * Compute the effective exclusive CPUs that will be deleted.
- */
- if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
- !cpumask_intersects(delmask, subpartitions_cpus))
- return; /* No deletion of exclusive CPUs in partitions */
-
- /*
- * Searching the remote children list to look for those that will
- * be impacted by the deletion of exclusive CPUs.
- *
- * Since a cpuset must be removed from the remote children list
- * before it can go offline and holding cpuset_mutex will prevent
- * any change in cpuset status. RCU read lock isn't needed.
- */
- lockdep_assert_held(&cpuset_mutex);
- list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
- if (cpumask_intersects(child->effective_cpus, delmask)) {
- remote_partition_disable(child, tmp);
- disable_cnt++;
- }
- if (disable_cnt)
- cpuset_force_rebuild();
-}
-
-/*
* prstate_housekeeping_conflict - check for partition & housekeeping conflicts
* @prstate: partition root state to be checked
* @new_cpus: cpu mask
@@ -1601,7 +1634,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
* The partcmd_update command is used by update_cpumasks_hier() with newmask
* NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
* by update_cpumask() with NULL newmask. In both cases, the callers won't
- * check for error and so partition_root_state and prs_error will be updated
+ * check for error and so partition_root_state and prs_err will be updated
* directly.
*/
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
@@ -1614,11 +1647,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
int subparts_delta = 0;
- struct cpumask *xcpus; /* cs effective_xcpus */
int isolcpus_updated = 0;
+ struct cpumask *xcpus = user_xcpus(cs);
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
+ WARN_ON_ONCE(is_remote_partition(cs));
/*
* new_prs will only be changed for the partcmd_update and
@@ -1626,7 +1660,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
- xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) {
if (is_prs_invalid(old_prs))
@@ -1661,12 +1694,19 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
+ * Need to call compute_effective_exclusive_cpumask() in case
+ * exclusive_cpus not set. Sibling conflict should only happen
+ * if exclusive_cpus isn't set.
+ */
+ xcpus = tmp->new_cpus;
+ if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+
+ /*
* Enabling partition root is not allowed if its
- * effective_xcpus is empty or doesn't overlap with
- * parent's effective_xcpus.
+ * effective_xcpus is empty.
*/
- if (cpumask_empty(xcpus) ||
- !cpumask_intersects(xcpus, parent->effective_xcpus))
+ if (cpumask_empty(xcpus))
return PERR_INVCPUS;
if (prstate_housekeeping_conflict(new_prs, xcpus))
@@ -1679,19 +1719,22 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (nocpu)
return PERR_NOCPUS;
- cpumask_copy(tmp->delmask, xcpus);
- deleting = true;
- subparts_delta++;
+ deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
+ if (deleting)
+ subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
- * May need to add cpus to parent's effective_cpus for
- * valid partition root.
+ * May need to add cpus back to parent's effective_cpus
+ * (and maybe removed from subpartitions_cpus/isolated_cpus)
+ * for valid partition root. xcpus may contain CPUs that
+ * shouldn't be removed from the two global cpumasks.
*/
- adding = !is_prs_invalid(old_prs) &&
- cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
- if (adding)
+ if (is_partition_valid(cs)) {
+ cpumask_copy(tmp->addmask, cs->effective_xcpus);
+ adding = true;
subparts_delta--;
+ }
new_prs = PRS_MEMBER;
} else if (newmask) {
/*
@@ -1701,6 +1744,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
part_error = PERR_CPUSEMPTY;
goto write_error;
}
+
/* Check newmask again, whether cpus are available for parent/cs */
nocpu |= tasks_nocpu_error(parent, cs, newmask);
@@ -1829,7 +1873,7 @@ write_error:
* CPU lists in cs haven't been updated yet. So defer it to later.
*/
if ((old_prs != new_prs) && (cmd != partcmd_update)) {
- int err = update_partition_exclusive(cs, new_prs);
+ int err = update_partition_exclusive_flag(cs, new_prs);
if (err)
return err;
@@ -1867,7 +1911,7 @@ write_error:
update_unbound_workqueue_cpumask(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
- update_partition_exclusive(cs, new_prs);
+ update_partition_exclusive_flag(cs, new_prs);
if (adding || deleting) {
cpuset_update_tasks_cpumask(parent, tmp->addmask);
@@ -1917,7 +1961,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus);
+ compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -1925,6 +1969,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
if (!is_partition_valid(child))
continue;
+ /*
+ * There shouldn't be a remote partition underneath another
+ * partition root.
+ */
+ WARN_ON_ONCE(is_remote_partition(child));
child->prs_err = 0;
if (!cpumask_subset(child->effective_xcpus,
cs->effective_xcpus))
@@ -1980,32 +2029,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
bool remote = is_remote_partition(cp);
bool update_parent = false;
+ old_prs = new_prs = cp->partition_root_state;
+
/*
- * Skip descendent remote partition that acquires CPUs
- * directly from top cpuset unless it is cs.
+ * For child remote partition root (!= cs), we need to call
+ * remote_cpus_update() if effective_xcpus will be changed.
+ * Otherwise, we can skip the whole subtree.
+ *
+ * remote_cpus_update() will reuse tmp->new_cpus only after
+ * its value is being processed.
*/
if (remote && (cp != cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+ remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
+ rcu_read_lock();
- /*
- * Update effective_xcpus if exclusive_cpus set.
- * The case when exclusive_cpus isn't set is handled later.
- */
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
- spin_lock_irq(&callback_lock);
- compute_effective_exclusive_cpumask(cp, NULL);
- spin_unlock_irq(&callback_lock);
+ /* Remote partition may be invalidated */
+ new_prs = cp->partition_root_state;
+ remote = (new_prs == old_prs);
}
- old_prs = new_prs = cp->partition_root_state;
- if (remote || (is_partition_valid(parent) &&
- is_partition_valid(cp)))
+ if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
compute_partition_effective_cpumask(cp, tmp->new_cpus);
else
compute_effective_cpumask(tmp->new_cpus, cp, parent);
+ if (remote)
+ goto get_css; /* Ready to update cpuset data */
+
/*
* A partition with no effective_cpus is allowed as long as
* there is no task associated with it. Call
@@ -2025,9 +2081,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
- if (remote)
- goto get_css;
-
/*
* Skip the whole subtree if
* 1) the cpumask remains the same,
@@ -2088,6 +2141,9 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
+ compute_effective_exclusive_cpumask(cp, NULL, NULL);
+
/*
* Make sure effective_xcpus is properly set for a valid
* partition root.
@@ -2174,7 +2230,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
parent);
if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
+ } else if (is_remote_partition(sibling)) {
+ /*
+ * Change in a sibling cpuset won't affect a remote
+ * partition root.
+ */
+ continue;
}
+
if (!css_tryget_online(&sibling->css))
continue;
@@ -2231,8 +2294,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* trialcs->effective_xcpus is used as a temporary cpumask
* for checking validity of the partition root.
*/
+ trialcs->partition_root_state = PRS_MEMBER;
if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL);
+ compute_effective_exclusive_cpumask(trialcs, NULL, cs);
}
/* Nothing to do if the cpus didn't change */
@@ -2305,19 +2369,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Call remote_cpus_update() to handle valid remote partition
*/
if (is_remote_partition(cs))
- remote_cpus_update(cs, xcpus, &tmp);
+ remote_cpus_update(cs, NULL, xcpus, &tmp);
else if (invalidate)
update_parent_effective_cpumask(cs, partcmd_invalidate,
NULL, &tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
xcpus, &tmp);
- } else if (!cpumask_empty(cs->exclusive_cpus)) {
- /*
- * Use trialcs->effective_cpus as a temp cpumask
- */
- remote_partition_check(cs, trialcs->effective_xcpus,
- trialcs->effective_cpus, &tmp);
}
spin_lock_irq(&callback_lock);
@@ -2369,8 +2427,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf)
- compute_effective_exclusive_cpumask(trialcs, NULL);
+ if (*buf) {
+ trialcs->partition_root_state = PRS_MEMBER;
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
+ return -EINVAL;
+ }
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2401,8 +2466,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (invalidate)
remote_partition_disable(cs, &tmp);
else
- remote_cpus_update(cs, trialcs->effective_xcpus,
- &tmp);
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, &tmp);
} else if (invalidate) {
update_parent_effective_cpumask(cs, partcmd_invalidate,
NULL, &tmp);
@@ -2410,12 +2475,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_parent_effective_cpumask(cs, partcmd_update,
trialcs->effective_xcpus, &tmp);
}
- } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
- /*
- * Use trialcs->effective_cpus as a temp cpumask
- */
- remote_partition_check(cs, trialcs->effective_xcpus,
- trialcs->effective_cpus, &tmp);
}
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
@@ -2782,7 +2841,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
- bool new_xcpus_state = false;
+ bool isolcpus_updated = false;
if (old_prs == new_prs)
return 0;
@@ -2796,18 +2855,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- /*
- * Setup effective_xcpus if not properly set yet, it will be cleared
- * later if partition becomes invalid.
- */
- if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
- spin_lock_irq(&callback_lock);
- cpumask_and(cs->effective_xcpus,
- cs->cpus_allowed, parent->effective_xcpus);
- spin_unlock_irq(&callback_lock);
- }
-
- err = update_partition_exclusive(cs, new_prs);
+ err = update_partition_exclusive_flag(cs, new_prs);
if (err)
goto out;
@@ -2821,6 +2869,19 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
/*
+ * We don't support the creation of a new local partition with
+ * a remote partition underneath it. This unsupported
+ * setting can happen only if parent is the top_cpuset because
+ * a remote partition cannot be created underneath an existing
+ * local or remote partition.
+ */
+ if ((parent == &top_cpuset) &&
+ cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+ err = PERR_REMOTE;
+ goto out;
+ }
+
+ /*
* If parent is valid partition, enable local partition.
* Otherwise, enable a remote partition.
*/
@@ -2835,8 +2896,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
+ * Need to update isolated_cpus.
*/
- new_xcpus_state = true;
+ isolcpus_updated = true;
} else {
/*
* Switching back to member is always allowed even if it
@@ -2860,7 +2922,7 @@ out:
*/
if (err) {
new_prs = -new_prs;
- update_partition_exclusive(cs, new_prs);
+ update_partition_exclusive_flag(cs, new_prs);
}
spin_lock_irq(&callback_lock);
@@ -2868,14 +2930,18 @@ out:
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
- else if (new_xcpus_state)
- partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
+ else if (isolcpus_updated)
+ isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(new_xcpus_state);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
- /* Force update if switching back to member */
+ /* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ /* A newly created partition must have effective_xcpus set */
+ WARN_ON_ONCE(!old_prs && (new_prs > 0)
+ && cpumask_empty(cs->effective_xcpus));
+
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
@@ -3208,7 +3274,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v)
return ret;
}
-static int sched_partition_show(struct seq_file *seq, void *v)
+static int cpuset_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
const char *err, *type = NULL;
@@ -3239,7 +3305,7 @@ static int sched_partition_show(struct seq_file *seq, void *v)
return 0;
}
-static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
@@ -3260,11 +3326,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
css_get(&cs->css);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- retval = update_prstate(cs, val);
-out_unlock:
+ if (is_cpuset_online(cs))
+ retval = update_prstate(cs, val);
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
css_put(&cs->css);
@@ -3308,8 +3371,8 @@ static struct cftype dfl_files[] = {
{
.name = "cpus.partition",
- .seq_show = sched_partition_show,
- .write = sched_partition_write,
+ .seq_show = cpuset_partition_show,
+ .write = cpuset_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cpuset, partition_file),
@@ -3475,9 +3538,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_lock();
mutex_lock(&cpuset_mutex);
- if (is_partition_valid(cs))
- update_prstate(cs, 0);
-
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -3488,6 +3548,22 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_unlock();
}
+static void cpuset_css_killed(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+
+ /* Reset valid partition back to member */
+ if (is_partition_valid(cs))
+ update_prstate(cs, PRS_MEMBER);
+
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+
+}
+
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -3609,6 +3685,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
+ .css_killed = cpuset_css_killed,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
@@ -3739,10 +3816,10 @@ retry:
if (remote && cpumask_empty(&new_cpus) &&
partition_is_populated(cs, NULL)) {
+ cs->prs_err = PERR_HOTPLUG;
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
- cpuset_force_rebuild();
}
/*
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 4bb587d5d34f..b2239156b7de 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -318,10 +318,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
might_sleep();
for_each_possible_cpu(cpu) {
- struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
+ struct cgroup *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
__cgroup_rstat_lock(cgrp, cpu);
+ pos = cgroup_rstat_updated_list(cgrp, cpu);
for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b08bb34b1718..a59e009e0be4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2069,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- [CPUHP_PERF_PREPARE] = {
- .name = "perf:prepare",
- .startup.single = perf_event_init_cpu,
- .teardown.single = perf_event_exit_cpu,
- },
[CPUHP_RANDOM_PREPARE] = {
.name = "random:prepare",
.startup.single = random_prepare_cpu,
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 3b2bdca9f1d4..77c8d9487a9a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata;
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- if (!rmem->priv) {
- struct dma_coherent_mem *mem;
+ struct dma_coherent_mem *mem = rmem->priv;
+ if (!mem) {
mem = dma_init_coherent_memory(rmem->base, rmem->base,
rmem->size, true);
if (IS_ERR(mem))
return PTR_ERR(mem);
rmem->priv = mem;
}
- dma_assign_coherent_memory(dev, rmem->priv);
+
+ /* Warn if the device potentially can't use the reserved memory */
+ if (mem->device_base + rmem->size - 1 >
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit))
+ dev_warn(dev, "reserved memory is beyond device's set DMA address range\n");
+
+ dma_assign_coherent_memory(dev, mem);
return 0;
}
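A worked example of the new warning condition above (numbers are hypothetical): with dev->coherent_dma_mask = 0xffffffff, dev->bus_dma_limit unset (0), and a reserved region at base 0xf0000000 of size 0x20000000, the last byte of the pool sits at 0xf0000000 + 0x20000000 - 1 = 0x10fffffff, which exceeds min_not_zero(0xffffffff, 0) = 0xffffffff, so the dev_warn() fires because part of the pool is unreachable by the device.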
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 055da410ac71..8df0dfaaca18 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes __initconst =
- (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
static phys_addr_t size_cmdline __initdata = -1;
static phys_addr_t base_cmdline __initdata;
static phys_addr_t limit_cmdline __initdata;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index cda127027e48..051a32988040 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -910,6 +910,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_coherent_mask);
+static bool __dma_addressing_limited(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
+
+ if (unlikely(ops) || use_dma_iommu(dev))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
+}
+
/**
* dma_addressing_limited - return if the device is addressing limited
* @dev: device to check
@@ -920,15 +933,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
*/
bool dma_addressing_limited(struct device *dev)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
- dma_get_required_mask(dev))
- return true;
-
- if (unlikely(ops) || use_dma_iommu(dev))
+ if (!__dma_addressing_limited(dev))
return false;
- return !dma_direct_all_ram_mapped(dev);
+
+ dev_dbg(dev, "device is DMA addressing limited\n");
+ return true;
}
EXPORT_SYMBOL_GPL(dma_addressing_limited);
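For illustration, a driver-side consumer of the refactored helper might look like the sketch below; the function name and the fallback policy are hypothetical, only dma_addressing_limited() itself comes from the code above:

	#include <linux/device.h>
	#include <linux/dma-mapping.h>

	/*
	 * Hypothetical probe-time check: restrict buffer placement when the
	 * device cannot reach all of system RAM directly.
	 */
	static void demo_check_dma(struct device *dev)
	{
		if (dma_addressing_limited(dev))
			dev_warn(dev, "limited DMA addressing, using a bounce pool\n");
	}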
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 20154572ede9..a8dd1f27417c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -146,7 +146,7 @@ static inline bool report_single_step(unsigned long work)
return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
-static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
bool step;
@@ -173,53 +173,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
ptrace_report_syscall_exit(regs, step);
}
-/*
- * Syscall specific exit to user mode preparation. Runs with interrupts
- * enabled.
- */
-static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- unsigned long nr = syscall_get_nr(current, regs);
-
- CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
-
- if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
- local_irq_enable();
- }
-
- rseq_syscall(regs);
-
- /*
- * Do one-time syscall specific work. If these work items are
- * enabled, we want to run them exactly once per syscall exit with
- * interrupts enabled.
- */
- if (unlikely(work & SYSCALL_WORK_EXIT))
- syscall_exit_work(regs, work);
-}
-
-static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- syscall_exit_to_user_mode_prepare(regs);
- local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
-}
-
-void syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- __syscall_exit_to_user_mode_work(regs);
-}
-
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- __syscall_exit_to_user_mode_work(regs);
- instrumentation_end();
- exit_to_user_mode();
-}
-
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 128db74e9eab..f34c99f8ce8f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1270,6 +1270,10 @@ static void put_ctx(struct perf_event_context *ctx)
if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
+ } else {
+ smp_mb__after_atomic(); /* pairs with wait_var_event() */
+ if (ctx->task == TASK_TOMBSTONE)
+ wake_up_var(&ctx->refcount);
}
}
@@ -2167,7 +2171,7 @@ static void perf_put_aux_event(struct perf_event *event)
* If the event is an aux_event, tear down all links to
* it from other events.
*/
- for_each_sibling_event(iter, event->group_leader) {
+ for_each_sibling_event(iter, event) {
if (iter->aux_event != event)
continue;
@@ -2325,7 +2329,11 @@ static void perf_child_detach(struct perf_event *event)
if (WARN_ON_ONCE(!parent_event))
return;
+ /*
+ * Can't check this from an IPI, the holder is likely another CPU.
+ *
lockdep_assert_held(&parent_event->child_mutex);
+ */
sync_child_event(event);
list_del_init(&event->child_list);
@@ -2343,6 +2351,11 @@ event_filter_match(struct perf_event *event)
perf_cgroup_match(event);
}
+static inline bool is_event_in_freq_mode(struct perf_event *event)
+{
+ return event->attr.freq && event->attr.sample_freq;
+}
+
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2380,7 +2393,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
epc->nr_freq--;
}
@@ -2450,8 +2463,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
-#define DETACH_EXIT 0x08UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2477,12 +2491,15 @@ __perf_remove_from_context(struct perf_event *event,
*/
if (flags & DETACH_EXIT)
state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
if (flags & DETACH_DEAD) {
event->pending_disable = 1;
state = PERF_EVENT_STATE_DEAD;
}
event_sched_out(event, ctx);
perf_event_set_state(event, min(event->state, state));
+
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
@@ -2628,6 +2645,41 @@ void perf_event_disable_inatomic(struct perf_event *event)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ event->pmu->stop(event, 0);
+ event->hw.interrupts = MAX_INTERRUPTS;
+ if (event == event->group_leader)
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2656,10 +2708,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
@@ -2674,7 +2724,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
epc->nr_freq++;
}
@@ -3943,7 +3993,7 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
if (*perf_event_fasync(event))
- event->pending_kill = POLL_HUP;
+ event->pending_kill = POLL_ERR;
perf_event_wakeup(event);
} else {
@@ -4237,14 +4287,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- if (!event->attr.freq || !event->attr.sample_freq)
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
- if (!event->attr.freq || !event->attr.sample_freq)
+ if (!is_event_in_freq_mode(event))
continue;
/*
@@ -4516,7 +4562,8 @@ out:
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4543,7 +4590,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5125,6 +5172,7 @@ static bool is_sb_event(struct perf_event *event)
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5518,33 +5566,11 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_free_addr_filters(struct perf_event *event);
-static void perf_pending_task_sync(struct perf_event *event)
-{
- struct callback_head *head = &event->pending_task;
-
- if (!event->pending_work)
- return;
- /*
- * If the task is queued to the current task's queue, we
- * obviously can't wait for it to complete. Simply cancel it.
- */
- if (task_work_cancel(current, head)) {
- event->pending_work = 0;
- local_dec(&event->ctx->nr_no_switch_fast);
- return;
- }
-
- /*
- * All accesses related to the event are within the same RCU section in
- * perf_pending_task(). The RCU grace period before the event is freed
- * will make sure all those accesses are complete by then.
- */
- rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
-}
-
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5574,6 +5600,7 @@ static void __free_event(struct perf_event *event)
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5586,8 +5613,13 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5599,7 +5631,6 @@ static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
unaccount_event(event);
@@ -5625,13 +5656,13 @@ static void _free_event(struct perf_event *event)
/*
* Used to free events which have a known refcount of 1, such as in error paths
- * where the event isn't exposed yet and inherited events.
+ * of inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
- "unexpected event refcount: %ld; ptr=%p\n",
- atomic_long_read(&event->refcount), event)) {
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
@@ -5692,10 +5723,17 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
+ struct perf_event *parent;
+
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ parent = event->parent;
_free_event(event);
+
+ /* Matches the refcount bump in inherit_event() */
+ if (parent)
+ put_event(parent);
}
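
put_event() now pairs the final drop of a child event with dropping the reference that inherit_event() took on the parent. A minimal userspace sketch of that pairing (illustrative only; obj, obj_new and obj_put are made-up names, and the kernel version recurses via put_event(parent) rather than looping):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_long refcount;
        struct obj *parent;
};

static struct obj *obj_new(struct obj *parent)
{
        struct obj *o = calloc(1, sizeof(*o));

        atomic_init(&o->refcount, 1);
        o->parent = parent;
        if (parent)     /* mirrors the refcount bump taken in inherit_event() */
                atomic_fetch_add(&parent->refcount, 1);
        return o;
}

static void obj_put(struct obj *o)
{
        while (o) {
                struct obj *parent = o->parent;

                if (atomic_fetch_sub(&o->refcount, 1) != 1)
                        return;         /* not the last reference */
                printf("freeing %p\n", (void *)o);
                free(o);
                o = parent;             /* drop the reference held on the parent */
        }
}

int main(void)
{
        struct obj *parent = obj_new(NULL);
        struct obj *child = obj_new(parent);

        obj_put(child);         /* frees the child and drops its parent reference */
        obj_put(parent);        /* last reference: frees the parent */
        return 0;
}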
/*
@@ -5707,7 +5745,6 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
- LIST_HEAD(free_list);
/*
* If we got here through err_alloc: free_event(event); we will not
@@ -5736,15 +5773,17 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
- void *var = NULL;
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5777,50 +5816,30 @@ again:
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
- /*
- * This matches the refcount bump in inherit_event();
- * this can't be the last reference.
- */
- put_event(event);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
} else {
- var = &ctx->refcount;
+ child = NULL;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
- if (var) {
- /*
- * If perf_event_free_task() has deleted all events from the
- * ctx while the child_mutex got released above, make sure to
- * notify about the preceding put_ctx().
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
+ if (child) {
+ /* Last reference unless ->pending_task work is pending */
+ put_event(child);
}
+ put_ctx(ctx);
+
goto again;
}
mutex_unlock(&event->child_mutex);
- list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
- list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
-
no_ctx:
- put_event(event); /* Must be the 'last' reference */
+ /*
+ * Last reference unless ->pending_task work is pending on this event
+ * or any of its children.
+ */
+ put_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -6086,14 +6105,20 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
event->attr.pinned))
- return events;
+ return EPOLLERR;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -6185,14 +6210,6 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(event->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -6200,6 +6217,14 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
@@ -6257,12 +6282,18 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6319,7 +6350,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = perf_event_set_bpf_prog(event, prog, 0);
+ err = __perf_event_set_bpf_prog(event, prog, 0);
if (err) {
bpf_prog_put(prog);
return err;
@@ -6638,9 +6669,22 @@ void ring_buffer_put(struct perf_buffer *rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+#define get_mapped(event, func) \
+({ struct pmu *pmu; \
+ mapped_f f = NULL; \
+ guard(rcu)(); \
+ pmu = READ_ONCE(event->pmu); \
+ if (pmu) \
+ f = pmu->func; \
+ f; \
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
@@ -6648,8 +6692,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
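
The get_mapped() helper above takes one RCU-protected snapshot of event->pmu and the requested callback, so a racing perf_pmu_unregister() that clears the pointer is either observed in full (NULL, callback skipped) or not at all. A rough userspace analogue using C11 atomics (illustrative only; pmu_like and event_like are invented names, and this deliberately omits the RCU grace period that keeps the pmu object itself alive on the read side):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct pmu_like {
        void (*event_mapped)(int id);   /* may be cleared by an unregister path */
};

struct event_like {
        _Atomic(struct pmu_like *) pmu;
        int id;
};

static void say_mapped(int id)
{
        printf("event %d mapped\n", id);
}

/* Snapshot ->pmu once; the result is either the callback or NULL, never a torn mix. */
static void (*get_mapped_cb(struct event_like *ev))(int)
{
        struct pmu_like *pmu = atomic_load_explicit(&ev->pmu, memory_order_acquire);

        return pmu ? pmu->event_mapped : NULL;
}

int main(void)
{
        static struct pmu_like pmu = { .event_mapped = say_mapped };
        struct event_like ev = { .id = 1 };
        void (*mapped)(int);

        atomic_store(&ev.pmu, &pmu);

        mapped = get_mapped_cb(&ev);
        if (mapped)
                mapped(ev.id);

        atomic_store(&ev.pmu, NULL);            /* "unregister": unpublish the pmu */
        if (!get_mapped_cb(&ev))
                printf("pmu gone, callback skipped\n");
        return 0;
}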
@@ -6665,14 +6709,16 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
- if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event, vma->vm_mm);
+ /* FIXME vs perf_pmu_unregister() */
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
@@ -6865,6 +6911,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
+ mapped_f mapped;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6895,6 +6942,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -7073,8 +7130,9 @@ aux_unlock:
if (!ret)
ret = map_range(rb, vma);
- if (!ret && event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
return ret;
}
@@ -7085,6 +7143,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -7236,12 +7297,6 @@ static void perf_pending_task(struct callback_head *head)
int rctx;
/*
- * All accesses to the event must belong to the same implicit RCU read-side
- * critical section as the ->pending_work reset. See comment in
- * perf_pending_task_sync().
- */
- rcu_read_lock();
- /*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
@@ -7251,9 +7306,8 @@ static void perf_pending_task(struct callback_head *head)
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_no_switch_fast);
- rcuwait_wake_up(&event->pending_work_wait);
}
- rcu_read_unlock();
+ put_event(event);
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
@@ -9971,7 +10025,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes,
void perf_event_itrace_started(struct perf_event *event)
{
- event->attach_state |= PERF_ATTACH_ITRACE;
+ WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
}
static void perf_log_itrace_start(struct perf_event *event)
@@ -10054,14 +10108,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle &&
- hwc->interrupts > max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ perf_event_throttle_group(event);
+ ret = 1;
}
if (event->attr.freq) {
@@ -10248,6 +10301,7 @@ static int __perf_event_overflow(struct perf_event *event,
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
local_inc(&event->ctx->nr_no_switch_fast);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -11093,11 +11147,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
- u64 bpf_cookie)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
@@ -11132,6 +11190,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!event->prog)
@@ -11154,7 +11226,15 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -ENOENT;
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
u64 bpf_cookie)
{
return -ENOENT;
@@ -12259,6 +12339,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -12272,21 +12355,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created; now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12380,7 +12585,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
struct pmu *pmu;
int type, ret;
- guard(srcu)(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
/*
* Save original type before calling pmu->event_init() since certain
@@ -12604,13 +12809,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
- rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -12676,7 +12881,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (is_event_in_freq_mode(event))
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
@@ -12783,6 +12988,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12982,6 +13194,9 @@ set:
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
@@ -13163,6 +13378,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
if (!is_perf_file(group)) {
@@ -13170,6 +13390,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_fd;
}
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13466,7 +13690,7 @@ err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
err_alloc:
- free_event(event);
+ put_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -13503,6 +13727,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
@@ -13574,7 +13803,7 @@ err_unlock:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ put_event(event);
err:
return ERR_PTR(err);
}
@@ -13714,10 +13943,12 @@ static void sync_child_event(struct perf_event *child_event)
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
+ unsigned int attach_state;
if (parent_event) {
/*
@@ -13732,23 +13963,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
+ /* PERF_ATTACH_ITRACE might be set concurrently */
+ attach_state = READ_ONCE(event->attach_state);
}
- perf_remove_from_context(event, detach_flags | DETACH_EXIT);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
*/
if (parent_event) {
mutex_unlock(&parent_event->child_mutex);
+
/*
- * Kick perf_poll() for is_event_hup();
+ * Match the refcount initialization. Make sure it doesn't happen
+ * twice if pmu_detach_event() calls it on an already exited task.
*/
- perf_event_wakeup(parent_event);
- free_event(event);
- put_event(parent_event);
+ if (attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ * perf_pending_task() might have one too.
+ */
+ put_event(event);
+ }
+
return;
}
@@ -13758,15 +14004,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
- struct perf_event_context *child_ctx, *clone_ctx = NULL;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- WARN_ON_ONCE(child != current);
-
- child_ctx = perf_pin_task_context(child);
- if (!child_ctx)
+ ctx = perf_pin_task_context(task);
+ if (!ctx)
return;
/*
@@ -13779,27 +14023,28 @@ static void perf_event_exit_task_context(struct task_struct *child)
* without ctx::mutex (it cannot because of the move_group double mutex
* lock thing). See the comments in perf_install_in_context().
*/
- mutex_lock(&child_ctx->mutex);
+ mutex_lock(&ctx->mutex);
/*
* In a single ctx::lock section, de-schedule the events and detach the
* context from the task such that we cannot ever get it scheduled back
* in.
*/
- raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
+ raw_spin_lock_irq(&ctx->lock);
+ if (exit)
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
- put_ctx(child_ctx); /* cannot be last */
- WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
- put_task_struct(current); /* cannot be last */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ put_ctx(ctx); /* cannot be last */
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
- clone_ctx = unclone_ctx(child_ctx);
- raw_spin_unlock_irq(&child_ctx->lock);
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irq(&ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -13809,28 +14054,48 @@ static void perf_event_exit_task_context(struct task_struct *child)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
+ if (exit)
+ perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(child_event, ctx, false);
- mutex_unlock(&child_ctx->mutex);
+ mutex_unlock(&ctx->mutex);
- put_ctx(child_ctx);
+ if (!exit) {
+ /*
+ * perf_event_release_kernel() could still have a reference on
+ * this context. In that case we must wait for these events to
+ * have been freed (in particular all their references to this
+ * task must've been dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount,
+ refcount_read(&ctx->refcount) == 1);
+ }
+ put_ctx(ctx);
}
/*
- * When a child task exits, feed back event values to parent events.
+ * When a task exits, feed back event values to parent events.
*
* Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
-void perf_event_exit_task(struct task_struct *child)
+void perf_event_exit_task(struct task_struct *task)
{
struct perf_event *event, *tmp;
- mutex_lock(&child->perf_event_mutex);
- list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ WARN_ON_ONCE(task != current);
+
+ mutex_lock(&task->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &task->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
@@ -13841,44 +14106,23 @@ void perf_event_exit_task(struct task_struct *child)
*/
smp_store_release(&event->owner, NULL);
}
- mutex_unlock(&child->perf_event_mutex);
+ mutex_unlock(&task->perf_event_mutex);
- perf_event_exit_task_context(child);
+ perf_event_exit_task_context(task, true);
/*
* The perf_event_exit_task_context calls perf_event_task
- * with child's task_ctx, which generates EXIT events for
- * child contexts and sets child->perf_event_ctxp[] to NULL.
+ * with task's task_ctx, which generates EXIT events for
+ * task contexts and sets task->perf_event_ctxp[] to NULL.
* At this point we need to send EXIT events to cpu contexts.
*/
- perf_event_task(child, NULL, 0);
+ perf_event_task(task, NULL, 0);
/*
* Detach the perf_ctx_data for the system-wide event.
*/
guard(percpu_read)(&global_ctx_data_rwsem);
- detach_task_ctx_data(child);
-}
-
-static void perf_free_event(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *parent = event->parent;
-
- if (WARN_ON_ONCE(!parent))
- return;
-
- mutex_lock(&parent->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent->child_mutex);
-
- put_event(parent);
-
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
- free_event(event);
+ detach_task_ctx_data(task);
}
/*
@@ -13890,48 +14134,7 @@ static void perf_free_event(struct perf_event *event,
*/
void perf_event_free_task(struct task_struct *task)
{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = rcu_access_pointer(task->perf_event_ctxp);
- if (!ctx)
- return;
-
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
-
-
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
-
- mutex_unlock(&ctx->mutex);
-
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
+ perf_event_exit_task_context(task, false);
}
void perf_event_delayed_put(struct task_struct *task)
@@ -14008,6 +14211,14 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
@@ -14016,6 +14227,9 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
@@ -14037,8 +14251,6 @@ inherit_event(struct perf_event *parent_event,
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -14059,7 +14271,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5130b119d0ae..d2aef87c7e9f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order;
+ bool use_contiguous_pages = event->pmu->capabilities & (
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+ /*
+ * Initialize max_order to 0 for page allocation. This allocates single
+ * pages to minimize memory fragmentation. This is overridden if the
+ * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+ */
+ int max_order = 0;
+ int ret = -ENOMEM;
if (!has_aux(event))
return -EOPNOTSUPP;
@@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
if (!overwrite) {
/*
- * Watermark defaults to half the buffer, and so does the
- * max_order, to aid PMU drivers in double buffering.
+ * Watermark defaults to half the buffer, to aid PMU drivers
+ * in double buffering.
*/
if (!watermark)
watermark = min_t(unsigned long,
@@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
(unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
- * Use aux_watermark as the basis for chunking to
- * help PMU drivers honor the watermark.
+ * If using contiguous pages, use aux_watermark as the basis
+ * for chunking to help PMU drivers honor the watermark.
*/
- max_order = get_order(watermark);
+ if (use_contiguous_pages)
+ max_order = get_order(watermark);
} else {
/*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
+ * If using contiguous pages, we need to start with the
+ * max_order that fits in nr_pages, not the other way around,
+ * hence ilog2() and not get_order.
*/
- max_order = ilog2(nr_pages);
+ if (use_contiguous_pages)
+ max_order = ilog2(nr_pages);
watermark = 0;
}
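
With the change above, max_order stays 0 (single pages) unless the PMU sets PERF_PMU_CAP_AUX_NO_SG or PERF_PMU_CAP_AUX_PREFER_LARGE. A small standalone illustration of the two chunking bases used in the contiguous case (get_order() and ilog2() are re-implemented here purely for the demo; a PAGE_SHIFT of 12 is assumed):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order whose page count covers 'size' bytes (rounds up). */
static int get_order(unsigned long size)
{
        unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        int order = 0;

        while ((1UL << order) < pages)
                order++;
        return order;
}

/* Largest order that fits within 'n' pages (rounds down). */
static int ilog2(unsigned long n)
{
        int log = 0;

        while (n >>= 1)
                log++;
        return log;
}

int main(void)
{
        printf("non-overwrite, watermark of 3 pages -> max_order %d\n",
               get_order(3 * PAGE_SIZE));
        printf("overwrite, nr_pages of 8            -> max_order %d\n",
               ilog2(8));
        printf("no contiguous requirement           -> max_order 0\n");
        return 0;
}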
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 615b4e6d22c7..8d783b5882b6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1956,6 +1956,9 @@ static void free_ret_instance(struct uprobe_task *utask,
* to-be-reused return instances for future uretprobes. If ri_timer()
* happens to be running right now, though, we fall back to safety and
* just perform RCU-delayed freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here.
*/
if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
/* immediate reuse of ri without RCU GP is OK */
@@ -2016,12 +2019,20 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */
guard(rcu)();
- write_seqcount_begin(&utask->ri_seqcount);
+ /*
+ * See free_ret_instance() for notes on seqcount use.
+ * We also employ raw API variants to avoid lockdep false-positive
+ * warnings about enabled preemption. The timer can only be
+ * invoked once for a uprobe_task. Therefore there can only be one
+ * writer. The reader does not require an even sequence count to make
+ * progress, so it is OK to remain preemptible on PREEMPT_RT.
+ */
+ raw_write_seqcount_begin(&utask->ri_seqcount);
for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false);
- write_seqcount_end(&utask->ri_seqcount);
+ raw_write_seqcount_end(&utask->ri_seqcount);
}
static struct uprobe_task *alloc_utask(void)
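
Both comments above lean on the usual seqcount protocol: the single writer makes the sequence odd for the duration of its update, and a reader retries whenever it sampled an odd sequence or the sequence changed underneath it. A self-contained C11 sketch of that protocol (illustrative only; the kernel primitive uses plain data accesses plus barriers, while this demo keeps the data fields atomic to stay within standard C):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;
static atomic_int data_a, data_b;

static void writer_update(int a, int b)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);       /* odd: update in progress */
        atomic_store_explicit(&data_a, a, memory_order_relaxed);
        atomic_store_explicit(&data_b, b, memory_order_relaxed);
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);       /* even: update published */
}

static void reader_snapshot(int *a, int *b)
{
        unsigned int s;

        for (;;) {
                s = atomic_load_explicit(&seq, memory_order_acquire);
                if (s & 1)
                        continue;                               /* writer active, sample again */
                *a = atomic_load_explicit(&data_a, memory_order_relaxed);
                *b = atomic_load_explicit(&data_b, memory_order_relaxed);
                if (atomic_load_explicit(&seq, memory_order_acquire) == s)
                        return;                                 /* no writer interleaved: consistent */
        }
}

int main(void)
{
        int a, b;

        writer_update(1, 2);
        reader_snapshot(&a, &b);
        printf("a=%d b=%d\n", a, b);
        return 0;
}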
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..38645039dd8f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,8 +133,13 @@ struct release_task_post {
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
bool group_dead)
{
+ struct pid *pid = task_pid(p);
+
nr_threads--;
+
detach_pid(post->pids, p, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+
if (group_dead) {
detach_pid(post->pids, p, PIDTYPE_TGID);
detach_pid(post->pids, p, PIDTYPE_PGID);
@@ -253,7 +258,8 @@ repeat:
pidfs_exit(p);
cgroup_release(p);
- thread_pid = get_pid(p->thread_pid);
+ /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+ thread_pid = task_pid(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -282,8 +288,8 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ /* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
- put_pid(thread_pid);
add_device_randomness(&p->se.sum_exec_runtime,
sizeof(p->se.sum_exec_runtime));
free_pids(post.pids);
@@ -936,12 +942,12 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ trace_sched_process_exit(tsk, group_dead);
exit_mm();
if (group_dead)
acct_process();
- trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);
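
The wait_pidfd waitqueue woken above is the one pidfd pollers sleep on; user space typically consumes that wakeup as the pidfd becoming readable. A minimal illustration (not part of the patch, error handling omitted) of waiting for exit through a pidfd:

#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                 /* child: exit after a short nap */
                usleep(100 * 1000);
                _exit(0);
        }

        int pidfd = (int)syscall(SYS_pidfd_open, pid, 0);
        struct pollfd pfd = { .fd = pidfd, .events = POLLIN };

        poll(&pfd, 1, -1);              /* readable once the child has exited */
        printf("child %d exited, pidfd is readable\n", pid);

        waitpid(pid, NULL, 0);
        close(pidfd);
        return 0;
}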
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b..85afccfdf3b1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -498,10 +498,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
- /* track_pfn_copy() will later take care of copying internal state. */
- if (unlikely(new->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(new);
-
return new;
}
@@ -672,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp = vm_area_dup(mpnt);
if (!tmp)
goto fail_nomem;
+
+ /* track_pfn_copy() will later take care of copying internal state. */
+ if (unlikely(tmp->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(tmp);
+
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
@@ -1305,6 +1306,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
+ futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
@@ -1387,6 +1389,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
@@ -2036,17 +2039,16 @@ static inline void rcu_copy_process(struct task_struct *p)
}
/**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
+ * @ret_file: return the new pidfs file
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
+ * The helper verifies that @pid is still in use; without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2063,59 +2065,50 @@ static inline void rcu_copy_process(struct task_struct *p)
* error, a negative error code is returned from the function and the
* last argument remains unchanged.
*/
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
- struct file *pidfd_file;
+ struct file *pidfs_file;
+
+ /*
+ * PIDFD_STALE is only allowed to be passed if the caller knows
+ * that @pid is already registered in pidfs and thus
+ * PIDFD_INFO_EXIT information is guaranteed to be available.
+ */
+ if (!(flags & PIDFD_STALE)) {
+ /*
+ * While holding the pidfd waitqueue lock, removing the
+ * task linkage for the thread-group leader pid
+ * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+ * task linkage for PIDTYPE_PID but no thread-group
+ * leader linkage for the pid, it wasn't a
+ * thread-group leader in the first place.
+ */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ /* Task has already been reaped. */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ return -ESRCH;
+ /*
+ * If this struct pid isn't used as a thread-group
+ * leader but the caller requested to create a
+ * thread-group leader pidfd then report ENOENT.
+ */
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+ return -ENOENT;
+ }
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
- if (IS_ERR(pidfd_file))
- return PTR_ERR(pidfd_file);
+ pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
- *ret = pidfd_file;
+ *ret_file = pidfs_file;
return take_fd(pidfd);
}
-/**
- * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper verifies that @pid is still in use, without PIDFD_THREAD the
- * task identified by @pid must be a thread-group leader.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- * pidfd file is returned in the last argument to the function. On
- * error, a negative error code is returned from the function and the
- * last argument remains unchanged.
- */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
- bool thread = flags & PIDFD_THREAD;
-
- if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
- return -EINVAL;
-
- return __pidfd_prepare(pid, flags, ret);
-}
-
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2162,6 +2155,13 @@ static void rv_task_fork(struct task_struct *p)
#define rv_task_fork(p) do {} while (0)
#endif
+static bool need_futex_hash_allocate_default(u64 clone_flags)
+{
+ if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
+ return false;
+ return true;
+}
+
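
need_futex_hash_allocate_default() keys the default hash allocation to the first real thread: both CLONE_THREAD and CLONE_VM must be set, as used by the copy_process() hunk below. A trivial userspace check of which clone() styles satisfy that condition (illustrative only):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int needs_default_hash(unsigned long clone_flags)
{
        return (clone_flags & (CLONE_THREAD | CLONE_VM)) == (CLONE_THREAD | CLONE_VM);
}

int main(void)
{
        printf("fork()-style clone (no flags)   : %d\n", needs_default_hash(0));
        printf("CLONE_VM only (vfork()-style)   : %d\n", needs_default_hash(CLONE_VM));
        printf("pthread_create()-style clone    : %d\n",
               needs_default_hash(CLONE_VM | CLONE_THREAD | CLONE_SIGHAND | CLONE_FS));
        return 0;
}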
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2462,7 +2462,7 @@ __latent_entropy struct task_struct *copy_process(
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2543,6 +2543,21 @@ __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
/*
+ * Allocate a default futex hash for the user process once the first
+ * thread spawns.
+ */
+ if (need_futex_hash_allocate_default(clone_flags)) {
+ retval = futex_hash_allocate_default();
+ if (retval)
+ goto bad_fork_core_free;
+ /*
+ * If we fail beyond this point we don't free the allocated
+ * futex hash map. We assume that another thread will be created
+ * and makes use of it. The hash map will be freed once the main
+ * thread terminates.
+ */
+ }
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cca15859a50b..19a2c65f3d37 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -36,9 +36,15 @@
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
+#include <linux/rcuref.h>
+#include <linux/mempolicy.h>
+#include <linux/mmap_lock.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -49,12 +55,24 @@
* reside in the same cacheline.
*/
static struct {
- struct futex_hash_bucket *queues;
unsigned long hashmask;
+ unsigned int hashshift;
+ struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashmask (__futex_data.hashmask)
+#define futex_hashmask (__futex_data.hashmask)
+#define futex_hashshift (__futex_data.hashshift)
+#define futex_queues (__futex_data.queues)
+
+struct futex_private_hash {
+ rcuref_t users;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ void *mm;
+ bool custom;
+ bool immutable;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -107,21 +125,328 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
+bool futex_private_hash_get(struct futex_private_hash *fph)
+{
+ if (fph->immutable)
+ return true;
+ return rcuref_get(&fph->users);
+}
+
+void futex_private_hash_put(struct futex_private_hash *fph)
+{
+ /* Ignore return value, last put is verified via rcuref_is_dead() */
+ if (fph->immutable)
+ return;
+ if (rcuref_put(&fph->users))
+ wake_up_var(fph->mm);
+}
+
/**
- * futex_hash - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
+ * futex_hash_get - Get an additional reference for the local hash.
+ * @hb: ptr to the private local hash.
*
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
+ * Obtain an additional reference for the already obtained hash bucket. The
+ * caller must already own a reference.
*/
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ futex_private_hash_put(fph);
+}
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ u32 hash;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (!fph || !fph->hash_mask)
+ return NULL;
+
+ hash = jhash2((void *)&key->private.address,
+ sizeof(key->private.address) / 4,
+ key->both.offset);
+ return &fph->queues[hash & fph->hash_mask];
+}
+
+static void futex_rehash_private(struct futex_private_hash *old,
+ struct futex_private_hash *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = __futex_hash(&this->key, new);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+ struct futex_private_hash *new)
+{
+ struct futex_private_hash *fph;
+
+ WARN_ON_ONCE(mm->futex_phash_new);
+
+ fph = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (fph) {
+ if (!rcuref_is_dead(&fph->users)) {
+ mm->futex_phash_new = new;
+ return false;
+ }
+
+ futex_rehash_private(fph, new);
+ }
+ rcu_assign_pointer(mm->futex_phash, new);
+ kvfree_rcu(fph, rcu);
+ return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *fph;
+
+ fph = mm->futex_phash_new;
+ if (fph) {
+ mm->futex_phash_new = NULL;
+ __futex_pivot_hash(mm, fph);
+ }
+ }
+}
+
+struct futex_private_hash *futex_private_hash(void)
+{
+ struct mm_struct *mm = current->mm;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new private hash is already prepared and a reference can't be
+ * obtained once the last user dropped theirs.
+ * In that case we block on mm_struct::futex_hash_lock and either have
+ * to perform the replacement or wait while someone else is doing the
+ * job. Either way, on the second iteration we acquire a reference on the
+ * new private hash or loop again because a new replacement has been
+ * requested.
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_private_hash *fph;
+
+ fph = rcu_dereference(mm->futex_phash);
+ if (!fph)
+ return NULL;
+
+ if (fph->immutable)
+ return fph;
+ if (rcuref_get(&fph->users))
+ return fph;
+ }
+ futex_pivot_hash(mm);
+ goto again;
+}
+
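
The loop in futex_private_hash() above depends on one property of rcuref: once the count has dropped to zero the reference is dead and rcuref_get() keeps failing, which is what forces callers onto futex_hash_lock to complete the pivot. A simplified userspace model of that behaviour (illustrative only; real rcuref also handles saturation and uses a different counter encoding):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int users;

static bool ref_get(void)
{
        int v = atomic_load(&users);

        while (v > 0) {                 /* once it reached zero it stays dead */
                if (atomic_compare_exchange_weak(&users, &v, v + 1))
                        return true;
        }
        return false;
}

static bool ref_put(void)               /* true when this was the last user */
{
        return atomic_fetch_sub(&users, 1) == 1;
}

int main(void)
{
        atomic_store(&users, 1);                        /* hash starts live, one reference */

        printf("get while live : %d\n", ref_get());     /* 1 */
        ref_put();                                      /* drop the reference just taken */
        if (ref_put())                                  /* drop the initial one: now dead */
                printf("last user gone, hash may be replaced\n");
        printf("get after dead : %d\n", ref_get());     /* 0: caller must pivot and retry */
        return 0;
}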
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
+ struct futex_private_hash *fph;
+ struct futex_hash_bucket *hb;
+
+again:
+ scoped_guard(rcu) {
+ hb = __futex_hash(key, NULL);
+ fph = hb->priv;
+
+ if (!fph || futex_private_hash_get(fph))
+ return hb;
+ }
+ futex_pivot_hash(key->private.mm);
+ goto again;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ return NULL;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ return __futex_hash(key, NULL);
+}
+
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#ifdef CONFIG_FUTEX_MPOL
+
+static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct mempolicy *mpol;
+ int node = FUTEX_NO_NODE;
+
+ if (!vma)
+ return FUTEX_NO_NODE;
+
+ mpol = vma_policy(vma);
+ if (!mpol)
+ return FUTEX_NO_NODE;
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ node = first_node(mpol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_BIND:
+ if (mpol->home_node != NUMA_NO_NODE)
+ node = mpol->home_node;
+ break;
+ default:
+ break;
+ }
+
+ return node;
+}
+
+static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
+{
+ int seq, node;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return -EBUSY;
+
+ node = __futex_key_to_node(mm, addr);
+
+ if (mmap_lock_speculate_retry(mm, seq))
+ return -EAGAIN;
+
+ return node;
+}
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ int node;
+
+ node = futex_key_to_node_opt(mm, addr);
+ if (node >= FUTEX_NO_NODE)
+ return node;
- return &futex_queues[hash & futex_hashmask];
+ guard(mmap_read_lock)(mm);
+ return __futex_key_to_node(mm, addr);
}
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
+
+/**
+ * __futex_hash - Return the hash bucket
+ * @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if it exists. Otherwise a hash bucket from the
+ * global hash is returned.
+ */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+{
+ int node = key->both.node;
+ u32 hash;
+
+ if (node == FUTEX_NO_NODE) {
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+ }
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / sizeof(u32),
+ key->both.offset);
+
+ if (node == FUTEX_NO_NODE) {
+ /*
+ * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+ * node -- this ensures regular futexes are interleaved across
+ * the nodes and avoids having to allocate multiple
+ * hash-tables.
+ *
+ * NOTE: this isn't perfectly uniform, but it is fast and
+ * handles sparse node masks.
+ */
+ node = (hash >> futex_hashshift) % nr_node_ids;
+ if (!node_possible(node)) {
+ node = find_next_bit_wrap(node_possible_map.bits,
+ nr_node_ids, node);
+ }
+ }
+
+ return &futex_queues[node][hash & futex_hashmask];
+}
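
For plain (non-NUMA) futexes the hash above doubles as a node selector: bits above futex_hashshift pick a node, and sparse node maps are handled by wrapping forward to the next possible node. A small standalone illustration of that selection (illustrative only; four node ids with holes are assumed, and find_next_bit_wrap() is approximated by a simple forward scan):

#include <stdio.h>

#define NR_NODE_IDS 4

static int node_possible(int node, unsigned int possible_mask)
{
        return possible_mask & (1u << node);
}

static int pick_node(unsigned int hash, unsigned int hashshift, unsigned int possible_mask)
{
        int node = (hash >> hashshift) % NR_NODE_IDS;

        /* Wrap forward to the next possible node (assumes at least one node is possible). */
        while (!node_possible(node, possible_mask))
                node = (node + 1) % NR_NODE_IDS;
        return node;
}

int main(void)
{
        unsigned int possible = 0x5;    /* nodes 0 and 2 exist; 1 and 3 are holes */

        for (unsigned int hash = 0; hash < 8; hash++)
                printf("hash 0x%03x -> node %d\n", hash << 8,
                       pick_node(hash << 8, 8, possible));
        return 0;
}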
/**
* futex_setup_timer - set up the sleeping hrtimer.
@@ -227,25 +552,60 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
struct page *page;
struct folio *folio;
struct address_space *mapping;
- int err, ro = 0;
+ int node, err, size, ro = 0;
+ bool node_updated = false;
bool fshared;
fshared = flags & FLAGS_SHARED;
+ size = futex_size(flags);
+ if (flags & FLAGS_NUMA)
+ size *= 2;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address % size) != 0))
return -EINVAL;
address -= key->both.offset;
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
+ node = FUTEX_NO_NODE;
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (futex_get_value(&node, naddr))
+ return -EFAULT;
+
+ if (node != FUTEX_NO_NODE &&
+ (node >= MAX_NUMNODES || !node_possible(node)))
+ return -EINVAL;
+ }
+
+ if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
+ node = futex_mpol(mm, address);
+ node_updated = true;
+ }
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (node == FUTEX_NO_NODE) {
+ node = numa_node_id();
+ node_updated = true;
+ }
+ if (node_updated && futex_put_value(node, naddr))
+ return -EFAULT;
+ }
+
+ key->both.node = node;
+
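
As read from the hunk above, a FLAGS_NUMA futex doubles its size: the node word sits directly after the futex word, the pair must be naturally aligned to the doubled size, and storing FUTEX_NO_NODE lets the kernel pick a node and write it back. A sketch of the corresponding user-side layout for a 32-bit futex (assumptions: the struct name is invented and FUTEX_NO_NODE is taken to be -1):

#include <assert.h>
#include <stdalign.h>
#include <stdint.h>

#define FUTEX_NO_NODE (-1)              /* assumed value, see the uapi futex header */

struct futex32_numa {
        uint32_t value;                 /* the futex word itself */
        int32_t  node;                  /* NUMA node id, or FUTEX_NO_NODE */
};

static_assert(sizeof(struct futex32_numa) == 8, "node word doubles the size");

int main(void)
{
        /* The doubled size is also the required alignment for the pair. */
        alignas(8) struct futex32_numa f = {
                .value = 0,
                .node  = FUTEX_NO_NODE, /* let the kernel choose and publish a node */
        };

        (void)f;
        return 0;
}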
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -502,13 +862,9 @@ void __futex_unqueue(struct futex_q *q)
}
/* The key must be already stored in q->key. */
-struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
__acquires(&hb->lock)
{
- struct futex_hash_bucket *hb;
-
- hb = futex_hash(&q->key);
-
/*
* Increment the counter before taking the lock so that
* a potential waker won't miss a to-be-slept task that is
@@ -522,14 +878,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock);
- return hb;
}
void futex_q_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
- spin_unlock(&hb->lock);
futex_hb_waiters_dec(hb);
+ spin_unlock(&hb->lock);
}
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
@@ -568,6 +923,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr;
int ret = 0;
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */
retry:
/*
@@ -606,6 +963,24 @@ retry:
return ret;
}
+void futex_q_lockptr_lock(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+
+ /*
+ * See futex_unqueue() why lock_ptr can change.
+ */
+ guard(rcu)();
+retry:
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ spin_lock(lock_ptr);
+
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+}
+
/*
* PI futexes can not be requeued and must remove themselves from the hash
* bucket. The hash bucket lock (i.e. lock_ptr) is held.
@@ -949,10 +1324,20 @@ static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
/*
+ * The mutex mm_struct::futex_hash_lock might be acquired.
+ */
+ might_sleep();
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ guard(private_hash)();
+ /*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
@@ -962,50 +1347,52 @@ static void exit_pi_state_list(struct task_struct *curr)
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
- hb = futex_hash(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ if (1) {
+ CLASS(hb, hb)(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * it's possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
}
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1125,30 +1512,304 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
+ struct futex_private_hash *fph)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ fhb->priv = fph;
+#endif
+ atomic_set(&fhb->waiters, 0);
+ plist_head_init(&fhb->chain);
+ spin_lock_init(&fhb->lock);
+}
+
+#define FH_CUSTOM 0x01
+#define FH_IMMUTABLE 0x02
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+void futex_hash_free(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ kvfree(mm->futex_phash_new);
+ fph = rcu_dereference_raw(mm->futex_phash);
+ if (fph) {
+ WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+ kvfree(fph);
+ }
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+
+ if (!mm->futex_phash_new)
+ return true;
+
+ fph = rcu_dereference(mm->futex_phash);
+ return rcuref_is_dead(&fph->users);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+ struct futex_private_hash *b)
+{
+ /* user provided always wins */
+ if (!a->custom && b->custom)
+ return true;
+ if (a->custom && !b->custom)
+ return false;
+
+ /* zero-sized hash wins */
+ if (!b->hash_mask)
+ return true;
+ if (!a->hash_mask)
+ return false;
+
+ /* keep the biggest */
+ if (a->hash_mask < b->hash_mask)
+ return true;
+ if (a->hash_mask > b->hash_mask)
+ return false;
+
+ return false; /* equal */
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+ bool custom = flags & FH_CUSTOM;
+ int i;
+
+ if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+ return -EINVAL;
+
+ /*
+ * Once we've disabled the global hash there is no way back.
+ */
+ scoped_guard(rcu) {
+ fph = rcu_dereference(mm->futex_phash);
+ if (fph && (!fph->hash_mask || fph->immutable)) {
+ if (custom)
+ return -EBUSY;
+ return 0;
+ }
+ }
+
+ fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!fph)
+ return -ENOMEM;
+
+ rcuref_init(&fph->users, 1);
+ fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+ fph->custom = custom;
+ fph->immutable = !!(flags & FH_IMMUTABLE);
+ fph->mm = mm;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fph->queues[i], fph);
+
+ if (custom) {
+ /*
+ * Only let prctl() wait / retry; don't unduly delay clone().
+ */
+again:
+ wait_var_event(mm, futex_pivot_pending(mm));
+ }
+
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *free __free(kvfree) = NULL;
+ struct futex_private_hash *cur, *new;
+
+ cur = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ new = mm->futex_phash_new;
+ mm->futex_phash_new = NULL;
+
+ if (fph) {
+ if (cur && !new) {
+ /*
+				 * If we have an existing hash but have not yet
+				 * allocated a replacement hash, drop the initial
+ * reference on the existing hash.
+ */
+ futex_private_hash_put(cur);
+ }
+
+ if (new) {
+ /*
+ * Two updates raced; throw out the lesser one.
+ */
+ if (futex_hash_less(new, fph)) {
+ free = new;
+ new = fph;
+ } else {
+ free = fph;
+ }
+ } else {
+ new = fph;
+ }
+ fph = NULL;
+ }
+
+ if (new) {
+ /*
+ * Will set mm->futex_phash_new on failure;
+ * futex_private_hash_get() will try again.
+ */
+ if (!__futex_pivot_hash(mm, new) && custom)
+ goto again;
+ }
+ }
+ return 0;
+}
+
+int futex_hash_allocate_default(void)
+{
+ unsigned int threads, buckets, current_buckets = 0;
+ struct futex_private_hash *fph;
+
+ if (!current->mm)
+ return 0;
+
+ scoped_guard(rcu) {
+ threads = min_t(unsigned int,
+ get_nr_threads(current),
+ num_online_cpus());
+
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph) {
+ if (fph->custom)
+ return 0;
+
+ current_buckets = fph->hash_mask + 1;
+ }
+ }
+
+ /*
+ * The default allocation will remain within
+ * 16 <= threads * 4 <= global hash size
+ */
+ buckets = roundup_pow_of_two(4 * threads);
+ buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+ if (current_buckets >= buckets)
+ return 0;
+
+ return futex_hash_allocate(buckets, 0);
+}
+
+static int futex_hash_get_slots(void)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph && fph->hash_mask)
+ return fph->hash_mask + 1;
+ return 0;
+}
+
+static int futex_hash_get_immutable(void)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph && fph->immutable)
+ return 1;
+ if (fph && !fph->hash_mask)
+ return 1;
+ return 0;
+}
+
+#else
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ return -EINVAL;
+}
+
+static int futex_hash_get_slots(void)
+{
+ return 0;
+}
+
+static int futex_hash_get_immutable(void)
+{
+ return 0;
+}
+#endif
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
+{
+ unsigned int flags = FH_CUSTOM;
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_SET_SLOTS:
+ if (arg4 & ~FH_FLAG_IMMUTABLE)
+ return -EINVAL;
+ if (arg4 & FH_FLAG_IMMUTABLE)
+ flags |= FH_IMMUTABLE;
+ ret = futex_hash_allocate(arg3, flags);
+ break;
+
+ case PR_FUTEX_HASH_GET_SLOTS:
+ ret = futex_hash_get_slots();
+ break;
+
+ case PR_FUTEX_HASH_GET_IMMUTABLE:
+ ret = futex_hash_get_immutable();
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
unsigned long hashsize, i;
- unsigned int futex_shift;
+ unsigned int order, n;
+ unsigned long size;
#ifdef CONFIG_BASE_SMALL
hashsize = 16;
#else
- hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ hashsize = 256 * num_possible_cpus();
+ hashsize /= num_possible_nodes();
+ hashsize = max(4, hashsize);
+ hashsize = roundup_pow_of_two(hashsize);
#endif
+ futex_hashshift = ilog2(hashsize);
+ size = sizeof(struct futex_hash_bucket) * hashsize;
+ order = get_order(size);
+
+ for_each_node(n) {
+ struct futex_hash_bucket *table;
+
+ if (order > MAX_PAGE_ORDER)
+ table = vmalloc_huge_node(size, GFP_KERNEL, n);
+ else
+ table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+ BUG_ON(!table);
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- hashsize, 0, 0,
- &futex_shift, NULL,
- hashsize, hashsize);
- hashsize = 1UL << futex_shift;
+ for (i = 0; i < hashsize; i++)
+ futex_hash_bucket_init(&table[i], NULL);
- for (i = 0; i < hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
+ futex_queues[n] = table;
}
futex_hashmask = hashsize - 1;
+ pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
+ hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
+ order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0;
}
core_initcall(futex_init);
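
For reference, a minimal user-space sketch of the prctl() interface handled by futex_hash_prctl() above. The sub-commands and FH_FLAG_IMMUTABLE are the names used in the hunk; PR_FUTEX_HASH is assumed to be the matching top-level prctl option exported through <linux/prctl.h>, and the program below is illustrative rather than part of the patch:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_FUTEX_HASH_*, FH_FLAG_IMMUTABLE (assumed exported here) */

int main(void)
{
	/*
	 * Request a private futex hash with 16 slots. Per futex_hash_allocate()
	 * above, the slot count must be a power of two and 1 is rejected;
	 * passing FH_FLAG_IMMUTABLE in arg4 would make the choice permanent.
	 */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0, 0) != 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");

	/* A reported slot count of 0 means the process still uses the global hash. */
	printf("slots: %d\n", prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0));
	printf("immutable: %d\n", prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE, 0, 0, 0));
	return 0;
}

For the implicit path, futex_hash_allocate_default() above sizes the hash as roundup_pow_of_two(4 * min(threads, online CPUs)) clamped to [16, global hash size]; for example, a 6-thread process on 8 CPUs ends up with 32 buckets.
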
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 6b2f4c7eb720..fcd1617212ee 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -7,6 +7,7 @@
#include <linux/sched/wake_q.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
@@ -38,6 +39,7 @@
#define FLAGS_HAS_TIMEOUT 0x0040
#define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100
+#define FLAGS_MPOL 0x0200
/* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op)
@@ -53,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
return flags;
}
-#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE)
/* FUTEX2_ to FLAGS_ */
static inline unsigned int futex2_to_flags(unsigned int flags2)
@@ -66,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2)
if (flags2 & FUTEX2_NUMA)
flags |= FLAGS_NUMA;
+ if (flags2 & FUTEX2_MPOL)
+ flags |= FLAGS_MPOL;
+
return flags;
}
@@ -86,6 +91,19 @@ static inline bool futex_flags_valid(unsigned int flags)
if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
return false;
+ /*
+ * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
+ * in a futex word.
+ */
+ if (flags & FLAGS_NUMA) {
+ int bits = 8 * futex_size(flags);
+ u64 max = ~0ULL;
+
+ max >>= 64 - bits;
+ if (nr_node_ids >= max)
+ return false;
+ }
+
return true;
}
@@ -117,6 +135,7 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
+ struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;
/*
@@ -156,6 +175,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi()
+ * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -182,6 +202,7 @@ struct futex_q {
union futex_key *requeue_pi_key;
u32 bitset;
atomic_t requeue_state;
+ bool drop_hb_ref;
#ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait;
#endif
@@ -196,12 +217,35 @@ enum futex_access {
extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
-
+extern void futex_q_lockptr_lock(struct futex_q *q);
extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern void futex_hash_get(struct futex_hash_bucket *hb);
+extern void futex_hash_put(struct futex_hash_bucket *hb);
+
+extern struct futex_private_hash *futex_private_hash(void);
+extern bool futex_private_hash_get(struct futex_private_hash *fph);
+extern void futex_private_hash_put(struct futex_private_hash *fph);
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
+static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline bool futex_private_hash_get(struct futex_private_hash *fph) { return false; }
+static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
+#endif
+
+DEFINE_CLASS(hb, struct futex_hash_bucket *,
+ if (_T) futex_hash_put(_T),
+ futex_hash(key), union futex_key *key);
+
+DEFINE_CLASS(private_hash, struct futex_private_hash *,
+ if (_T) futex_private_hash_put(_T),
+ futex_private_hash(), void);
/**
* futex_match - Check whether two futex keys are equal
@@ -219,9 +263,9 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2)
}
extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb);
-extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout);
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task);
+extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout);
extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
@@ -256,7 +300,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
* This looks a bit overkill, but generally just results in a couple
* of instructions.
*/
-static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
+static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
{
u32 val;
@@ -273,12 +317,26 @@ Efault:
return -EFAULT;
}
+static __always_inline int futex_put_value(u32 val, u32 __user *to)
+{
+ if (can_do_masked_user_access())
+ to = masked_user_access_begin(to);
+ else if (!user_read_access_begin(to, sizeof(*to)))
+ return -EFAULT;
+ unsafe_put_user(val, to, Efault);
+ user_read_access_end();
+ return 0;
+Efault:
+ user_read_access_end();
+ return -EFAULT;
+}
+
static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
{
int ret;
pagefault_disable();
- ret = futex_read_inatomic(dest, from);
+ ret = futex_get_value(dest, from);
pagefault_enable();
return ret;
@@ -354,7 +412,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
extern void futex_q_unlock(struct futex_hash_bucket *hb);
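
The DEFINE_CLASS() wrappers above (built on <linux/cleanup.h>, now included) are what the conversions in pi.c, requeue.c and waitwake.c below rely on: CLASS(hb, hb)(&key) performs the futex_hash() lookup and holds a reference that futex_hash_put() drops automatically at end of scope. A minimal sketch of the idiom, using a hypothetical helper that is not part of the patch:

static void example_peek_waiters(union futex_key *key)
{
	/* futex_hash(key); the reference pins the (private) hash for this scope */
	CLASS(hb, hb)(key);

	spin_lock(&hb->lock);
	if (futex_hb_waiters_pending(hb))
		pr_debug("bucket %p has waiters\n", hb);
	spin_unlock(&hb->lock);
	/* scope exit runs futex_hash_put(hb) */
}

guard(private_hash)(), used in futex_wait_multiple_setup() below, is the analogous scope guard for the whole private hash rather than a single bucket.
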
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 7a941845f7ee..dacb2330f1fb 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -806,7 +806,7 @@ handle_err:
break;
}
- spin_lock(q->lock_ptr);
+ futex_q_lockptr_lock(q);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
@@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
struct hrtimer_sleeper timeout, *to;
struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
int res, ret;
@@ -939,151 +938,183 @@ retry:
goto out;
retry_private:
- hb = futex_q_lock(&q);
+ if (1) {
+ CLASS(hb, hb)(&q.key);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- futex_q_unlock(hb);
+ futex_q_lock(&q, hb);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
/*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
*/
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
}
- }
- WARN_ON(!q.pi_state);
+ WARN_ON(!q.pi_state);
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __futex_queue(&q, hb, current);
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb, current);
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
- /*
- * Must be done before we enqueue the waiter, here is unfortunately
- * under the hb lock, but that *should* work because it does nothing.
- */
- rt_mutex_pre_schedule();
+ /*
+	 * Caution: releasing @hb in-scope. The hb->lock is still held
+	 * while the reference is dropped. The reference cannot be dropped
+	 * after the unlock because, if a user-initiated resize is in
+	 * progress, we might need to wake up the resizer. This cannot be
+	 * done after the rt_mutex_pre_schedule() invocation. The hb remains
+	 * valid because the thread performing the resize will block on
+	 * hb->lock during the requeue.
+ */
+ futex_hash_put(no_free_ptr(hb));
+ /*
+	 * Must be done before we enqueue the waiter; this is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
- rt_mutex_init_waiter(&rt_waiter);
+ rt_mutex_init_waiter(&rt_waiter);
- /*
- * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
- raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+	 * include hb->lock in the blocking chain, even though we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup:
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * must unwind the above, however we canont lock hb->lock because
- * rt_mutex already has a waiter enqueued and hb->lock can itself try
- * and enqueue an rt_waiter through rtlock.
- *
- * Doing the cleanup without holding hb->lock can cause inconsistent
- * state between hb and pi_state, but only in the direction of not
- * seeing a waiter that is leaving.
- *
- * See futex_unlock_pi(), it deals with this inconsistency.
- *
- * There be dragons here, since we must deal with the inconsistency on
- * the way out (here), it is impossible to detect/warn about the race
- * the other way around (missing an incoming waiter).
- *
- * What could possibly go wrong...
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+	 * unwind the above; however, we cannot lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
- /*
- * Now that the rt_waiter has been dequeued, it is safe to use
- * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
- * the
- */
- spin_lock(q.lock_ptr);
- /*
- * Waiter is unqueued.
- */
- rt_mutex_post_schedule();
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+	 * the pi_state owner below.
+ */
+ futex_q_lockptr_lock(&q);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_pi_owner(uaddr, &q, !ret);
- /*
- * If fixup_pi_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- futex_unqueue_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from futex_unlock_pi() */
+ futex_hash_put(hb);
+ }
+ goto out;
out_unlock_put_key:
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ goto out;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
out:
if (to) {
@@ -1091,18 +1122,6 @@ out:
destroy_hrtimer_on_stack(&to->timer);
}
return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- futex_q_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
}
/*
@@ -1114,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
int ret;
@@ -1134,7 +1152,7 @@ retry:
if (ret)
return ret;
- hb = futex_hash(&key);
+ CLASS(hb, hb)(&key);
spin_lock(&hb->lock);
retry_hb:
@@ -1187,6 +1205,12 @@ retry_hb:
*/
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (!rt_waiter) {
+ /*
+ * Acquire a reference for the leaving waiter to ensure
+ * valid futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ top_waiter->drop_hb_ref = true;
__futex_unqueue(top_waiter);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto retry_hb;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index b47bb764b352..c716a66f8692 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ /*
+	 * hb1 and hb2 belong to the same futex_private_hash because
+	 * if we managed to get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
}
q->key = *key2;
}
@@ -231,7 +236,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
-
+ /*
+ * Acquire a reference for the waiter to ensure valid
+ * futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ q->drop_hb_ref = true;
q->lock_ptr = &hb->lock;
/* Signal locked state to the waiter */
@@ -371,7 +381,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
@@ -443,240 +452,242 @@ retry:
if (requeue_pi && futex_match(&key1, &key2))
return -EINVAL;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- futex_hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
- if (likely(cmpval != NULL)) {
- u32 curval;
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
- ret = futex_get_value_locked(&curval, uaddr1);
+ if (likely(cmpval != NULL)) {
+ u32 curval;
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
+ ret = futex_get_value_locked(&curval, uaddr1);
- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
+ if (unlikely(ret)) {
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
- if (!(flags1 & FLAGS_SHARED))
- goto retry_private;
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;
- if (requeue_pi) {
- struct task_struct *exiting = NULL;
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- *
- * Updates topwaiter::requeue_state if a top waiter exists.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
- /*
- * At this point the top_waiter has either taken uaddr2 or
- * is waiting on it. In both cases pi_state has been
- * established and an initial refcount on it. In case of an
- * error there's nothing.
- *
- * The top waiter's requeue_state is up to date:
- *
- * - If the lock was acquired atomically (ret == 1), then
- * the state is Q_REQUEUE_PI_LOCKED.
- *
- * The top waiter has been dequeued and woken up and can
- * return to user space immediately. The kernel/user
- * space state is consistent. In case that there must be
- * more waiters requeued the WAITERS bit in the user
- * space futex is set so the top waiter task has to go
- * into the syscall slowpath to unlock the futex. This
- * will block until this requeue operation has been
- * completed and the hash bucket locks have been
- * dropped.
- *
- * - If the trylock failed with an error (ret < 0) then
- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
- * happened", or Q_REQUEUE_PI_IGNORE when there was an
- * interleaved early wakeup.
- *
- * - If the trylock did not succeed (ret == 0) then the
- * state is either Q_REQUEUE_PI_IN_PROGRESS or
- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
- * This will be cleaned up in the loop below, which
- * cannot fail because futex_proxy_trylock_atomic() did
- * the same sanity checks for requeue_pi as the loop
- * below does.
- */
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- case 1:
/*
- * futex_proxy_trylock_atomic() acquired the user space
- * futex. Adjust task_count.
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+			 * faults rather than in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
*/
- task_count++;
- ret = 0;
- break;
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
- /*
- * If the above failed, then pi_state is NULL and
- * waiter::requeue_state is correct.
- */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
/*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+			 * established with an initial refcount on it. In case of an
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+			 * space state is consistent. If more waiters must be
+			 * requeued, the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
*/
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!futex_match(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
}
- /* Plain futexes just wake or requeue and are done */
- if (!requeue_pi) {
- if (++task_count <= nr_wake)
- this->wake(&wake_q, this);
- else
- requeue_futex(this, hb1, hb2, &key2);
- continue;
- }
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (!futex_match(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
+ if (!futex_match(&this->key, &key1))
+ continue;
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- *
- * Prepare the waiter to take the rt_mutex. Take a refcount
- * on the pi_state and store the pointer in the futex_q
- * object of the waiter.
- */
- get_pi_state(pi_state);
-
- /* Don't requeue when the waiter is already on the way out. */
- if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
- * Early woken waiter signaled that it is on the
- * way out. Drop the pi_state reference and try the
- * next waiter. @this->pi_state is still NULL.
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
- put_pi_state(pi_state);
- continue;
- }
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
-
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the refcount
- * on pi_state nor clear this->pi_state because the
- * waiter needs the pi_state for cleaning up the
- * user space value. It will drop the refcount
- * after doing so. this::requeue_state is updated
- * in the wakeup as well.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- task_count++;
- } else if (!ret) {
- /* Waiter is queued, move it to hb2 */
- requeue_futex(this, hb1, hb2, &key2);
- futex_requeue_pi_complete(this, 0);
- task_count++;
- } else {
/*
- * rt_mutex_start_proxy_lock() detected a potential
- * deadlock when we tried to queue that waiter.
- * Drop the pi_state reference which we took above
- * and remove the pointer to the state from the
- * waiters futex_q object.
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
*/
- this->pi_state = NULL;
- put_pi_state(pi_state);
- futex_requeue_pi_complete(this, ret);
- /*
- * We stop queueing more waiters and let user space
- * deal with the mess.
- */
- break;
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
}
- }
- /*
- * We took an extra initial reference to the pi_state in
- * futex_proxy_trylock_atomic(). We need to drop it here again.
- */
- put_pi_state(pi_state);
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
out_unlock:
- double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
- futex_hb_waiters_dec(hb2);
return ret ? ret : task_count;
}
@@ -769,7 +780,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
struct rt_mutex_base *pi_mutex;
@@ -805,35 +815,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
if (ret)
goto out;
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (futex_match(&q.key, &key2)) {
- futex_q_unlock(hb);
- ret = -EINVAL;
- goto out;
- }
-
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
- /* The waiter is still on uaddr1 */
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, to);
- spin_unlock(&hb->lock);
+ {
+ CLASS(hb, hb)(&q.key);
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ }
break;
case Q_REQUEUE_PI_LOCKED:
/* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ futex_q_lockptr_lock(&q);
ret = fixup_pi_owner(uaddr2, &q, true);
/*
* Drop the reference to the pi state which the
@@ -860,7 +863,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
- spin_lock(q.lock_ptr);
+ futex_q_lockptr_lock(&q);
debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
@@ -892,6 +895,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default:
BUG();
}
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_hash_put(hb);
+ }
out:
if (to) {
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 25877d4f2f8f..e2bbe5509ec2 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -154,7 +154,6 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
- struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
DEFINE_WAKE_Q(wake_q);
@@ -170,7 +169,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;
- hb = futex_hash(&key);
+ CLASS(hb, hb)(&key);
/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -253,7 +252,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
DEFINE_WAKE_Q(wake_q);
@@ -266,67 +264,69 @@ retry:
if (unlikely(ret != 0))
return ret;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
+
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
+ }
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (futex_match (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
}
- this->wake(&wake_q, this);
- if (++ret >= nr_wake)
- break;
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
}
- }
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (futex_match (&this->key, &key2)) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match(&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
}
this->wake(&wake_q, this);
- if (++op_ret >= nr_wake2)
+ if (++ret >= nr_wake)
break;
}
}
- ret += op_ret;
- }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match(&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
out_unlock:
- double_unlock_hb(hb1, hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
return ret;
}
@@ -334,23 +334,12 @@ out_unlock:
static long futex_wait_restart(struct restart_block *restart);
/**
- * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
+ * futex_do_wait() - wait for wakeup, timeout, or signal
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
*/
-void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
+void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * futex_queue() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- futex_queue(q, hb, current);
-
/* Arm the timer */
if (timeout)
hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
@@ -412,12 +401,17 @@ int futex_unqueue_multiple(struct futex_vector *v, int count)
*/
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
- struct futex_hash_bucket *hb;
bool retry = false;
int ret, i;
u32 uval;
/*
+ * Make sure to have a reference on the private_hash such that we
+ * don't block on rehash after changing the task state below.
+ */
+ guard(private_hash)();
+
+ /*
* Enqueuing multiple futexes is tricky, because we need to enqueue
* each futex on the list before dealing with the next one to avoid
* deadlocking on the hash bucket. But, before enqueuing, we need to
@@ -451,20 +445,24 @@ retry:
struct futex_q *q = &vs[i].q;
u32 val = vs[i].w.val;
- hb = futex_q_lock(q);
- ret = futex_get_value_locked(&uval, uaddr);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- if (!ret && uval == val) {
- /*
- * The bucket lock can't be held while dealing with the
- * next futex. Queue each futex at this moment so hb can
- * be unlocked.
- */
- futex_queue(q, hb, current);
- continue;
- }
+ futex_q_lock(q, hb);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb, current);
+ continue;
+ }
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -578,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
* @val: the expected value
* @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
+ * @key2: the second futex_key if used for requeue PI
+ * @task: Task queueing this futex
*
* Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally.
@@ -586,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
*
* Return:
* - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ * - <0 - On error and the hb is unlocked. Possible reasons: the uaddr cannot
+ *   be read, does not contain the expected value, or is not properly aligned.
*/
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task)
{
u32 uval;
int ret;
@@ -618,26 +619,45 @@ retry:
return ret;
retry_private:
- *hb = futex_q_lock(q);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- ret = futex_get_value_locked(&uval, uaddr);
+ futex_q_lock(q, hb);
- if (ret) {
- futex_q_unlock(*hb);
+ ret = futex_get_value_locked(&uval, uaddr);
- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
+ if (ret) {
+ futex_q_unlock(hb);
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
- goto retry;
- }
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
- if (uval != val) {
- futex_q_unlock(*hb);
- ret = -EWOULDBLOCK;
+ if (uval != val) {
+ futex_q_unlock(hb);
+ return -EWOULDBLOCK;
+ }
+
+ if (key2 && futex_match(&q->key, key2)) {
+ futex_q_unlock(hb);
+ return -EINVAL;
+ }
+
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ if (task == current)
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb, task);
}
return ret;
@@ -647,7 +667,6 @@ int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
struct hrtimer_sleeper *to, u32 bitset)
{
struct futex_q q = futex_q_init;
- struct futex_hash_bucket *hb;
int ret;
if (!bitset)
@@ -660,12 +679,12 @@ retry:
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
if (ret)
return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!futex_unqueue(&q))
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 5c8d43cdb0a3..c05ba7ca00fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -761,7 +761,7 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw
static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
struct irq_data *irqd, int ind)
{
- struct msi_desc *desc = irq_data_get_msi_desc(irqd);
+ struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL;
if (!desc)
return;
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index ba069459c101..2351a19ac2a9 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt;
/*
* When a livepatch is in progress, enable klp stack checking in
- * cond_resched(). This helps CPU-bound kthreads get patched.
+ * schedule(). This helps CPU-bound kthreads get patched.
*/
-#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-
-#define klp_cond_resched_enable() sched_dynamic_klp_enable()
-#define klp_cond_resched_disable() sched_dynamic_klp_disable()
-
-#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
-EXPORT_SYMBOL(klp_sched_try_switch_key);
-#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
-#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
-
-#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
/*
* This work can be performed periodically to finish patching or unpatching any
@@ -365,26 +356,18 @@ static bool klp_try_switch_task(struct task_struct *task)
void __klp_sched_try_switch(void)
{
- if (likely(!klp_patch_pending(current)))
- return;
-
/*
- * This function is called from cond_resched() which is called in many
- * places throughout the kernel. Using the klp_mutex here might
- * deadlock.
- *
- * Instead, disable preemption to prevent racing with other callers of
- * klp_try_switch_task(). Thanks to task_call_func() they won't be
- * able to switch this task while it's running.
+ * This function is called from __schedule() while a context switch is
+ * about to happen. Preemption is already disabled and klp_mutex
+ * can't be acquired.
+	 * The disabled preemption prevents racing with other callers of
+	 * klp_try_switch_task(). Thanks to task_call_func() they won't be
+	 * able to switch this task while it's running.
*/
- preempt_disable();
+ lockdep_assert_preemption_disabled();
- /*
- * Make sure current didn't get patched between the above check and
- * preempt_disable().
- */
- if (unlikely(!klp_patch_pending(current)))
- goto out;
+ if (likely(!klp_patch_pending(current)))
+ return;
/*
* Enforce the order of the TIF_PATCH_PENDING read above and the
@@ -395,11 +378,7 @@ void __klp_sched_try_switch(void)
smp_rmb();
klp_try_switch_task(current);
-
-out:
- preempt_enable();
}
-EXPORT_SYMBOL(__klp_sched_try_switch);
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
@@ -508,7 +487,7 @@ void klp_try_complete_transition(void)
}
/* Done! Now cleanup the data structures. */
- klp_cond_resched_disable();
+ klp_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -560,7 +539,7 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
- klp_cond_resched_enable();
+ klp_resched_enable();
klp_signals_cnt = 0;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 58d78a33ac65..dd2bbf73718b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -219,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
+unsigned long nr_dynamic_keys;
unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
@@ -1238,6 +1239,7 @@ void lockdep_register_key(struct lock_class_key *key)
goto out_unlock;
}
hlist_add_head_rcu(&key->hash_entry, hash_head);
+ nr_dynamic_keys++;
out_unlock:
graph_unlock();
restore_irqs:
@@ -1977,41 +1979,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
}
/*
- * We are about to add A -> B into the dependency graph, and in __bfs() a
- * strong dependency path A -> .. -> B is found: hlock_class equals
- * entry->class.
- *
- * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
- * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
- * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
- * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
- * dependency graph, as any strong path ..-> A -> B ->.. we can get with
- * having dependency A -> B, we could already get a equivalent path ..-> A ->
- * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
- *
- * We need to make sure both the start and the end of A -> .. -> B is not
- * weaker than A -> B. For the start part, please see the comment in
- * check_redundant(). For the end part, we need:
- *
- * Either
- *
- * a) A -> B is -(*R)-> (everything is not weaker than that)
- *
- * or
- *
- * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
- *
- */
-static inline bool hlock_equal(struct lock_list *entry, void *data)
-{
- struct held_lock *hlock = (struct held_lock *)data;
-
- return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
- (hlock->read == 2 || /* A -> B is -(*R)-> */
- !entry->only_xr); /* A -> .. -> B is -(*N)-> */
-}
-
-/*
* We are about to add B -> A into the dependency graph, and in __bfs() a
* strong dependency path A -> .. -> B is found: hlock_class equals
* entry->class.
@@ -2916,6 +2883,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
#ifdef CONFIG_LOCKDEP_SMALL
/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get an equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B are not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
+}
+
+/*
* Check that the dependency graph starting at <src> can lead to
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
@@ -5101,6 +5103,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockevent_inc(lockdep_nocheck);
}
+ if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES))
+ return 0;
+
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
/*
@@ -6606,6 +6611,7 @@ void lockdep_unregister_key(struct lock_class_key *key)
pf = get_pending_free();
__lockdep_free_key_range(pf, key, 1);
need_callback = prepare_call_rcu_zapped(pf);
+ nr_dynamic_keys--;
}
lockdep_unlock();
raw_local_irq_restore(flags);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 20f9ef58d3d0..82156caf77d1 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -138,6 +138,7 @@ extern unsigned long nr_lock_classes;
extern unsigned long nr_zapped_classes;
extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries;
+extern unsigned long nr_dynamic_keys;
long lockdep_next_lockchain(long i);
unsigned long lock_chain_count(void);
extern unsigned long nr_stack_trace_entries;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6db0f43fc4df..b52c07c4707c 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " dynamic-keys: %11lu\n",
+ nr_dynamic_keys);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n",
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index d6964fc29f51..ef234469baac 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
return !reader; /* wake (readers until) 1 writer */
}
-static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+ bool freeze)
{
DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
bool wait;
@@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
spin_unlock_irq(&sem->waiters.lock);
while (wait) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE |
+ (freeze ? TASK_FREEZABLE : 0));
if (!smp_load_acquire(&wq_entry.private))
break;
schedule();
@@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+ bool freeze)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
- percpu_rwsem_wait(sem, /* .reader = */ true);
+ percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
preempt_disable();
trace_contention_end(sem, 0);
@@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
*/
if (!__percpu_down_write_trylock(sem)) {
trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
- percpu_rwsem_wait(sem, /* .reader = */ false);
+ percpu_rwsem_wait(sem, /* .reader = */ false, false);
contended = true;
}
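The new freeze argument only changes how a blocked reader sleeps; the writer path always passes false. A freezer-aware read-lock wrapper built on the updated __percpu_down_read() might look like the sketch below; the wrapper name is hypothetical and the fast path only approximates the existing percpu_down_read().

/* Hypothetical sketch, not the in-tree API. */
static inline void percpu_down_read_freezable_sketch(struct percpu_rw_semaphore *sem,
						     bool freeze)
{
	might_sleep();
	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
	preempt_disable();
	if (likely(rcu_sync_is_idle(&sem->rss)))
		this_cpu_inc(*sem->read_count);
	else
		__percpu_down_read(sem, false, freeze);	/* may now sleep TASK_FREEZABLE */
	preempt_enable();
}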
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index d7762ef5949a..39278737bb68 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -192,6 +192,11 @@ config GENDWARFKSYMS
depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT
# Requires ELF object files.
depends on !LTO
+ # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on
+ # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop
+ # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder:
+ # Verify 0 address DWARF variables are in ELF section").
+ depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129
help
Calculate symbol versions from DWARF debugging information using
gendwarfksyms. Requires DEBUG_INFO to be enabled.
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a2859dc3eea6..5c6ab20240a6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2829,6 +2829,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
+ codetag_free_module_sections(mod);
free_mod_mem(mod);
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c9d97ed20122..5f31fdff8a38 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -128,17 +128,13 @@ out_time:
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
- if (new_nsp->pid_ns_for_children)
- put_pid_ns(new_nsp->pid_ns_for_children);
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
- if (new_nsp->ipc_ns)
- put_ipc_ns(new_nsp->ipc_ns);
+ put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
- if (new_nsp->uts_ns)
- put_uts_ns(new_nsp->uts_ns);
+ put_uts_ns(new_nsp->uts_ns);
out_uts:
- if (new_nsp->mnt_ns)
- put_mnt_ns(new_nsp->mnt_ns);
+ put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
@@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
void free_nsproxy(struct nsproxy *ns)
{
- if (ns->mnt_ns)
- put_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns_for_children)
- put_pid_ns(ns->pid_ns_for_children);
- if (ns->time_ns)
- put_time_ns(ns->time_ns);
- if (ns->time_ns_for_children)
- put_time_ns(ns->time_ns_for_children);
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
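Dropping the per-pointer checks at these call sites assumes each put_*_ns() helper tolerates NULL and only drops a reference when a namespace is actually present, so the NULL test lives in one place. A generic sketch of that shape (types and names invented for illustration):

#include <linux/refcount.h>
#include <linux/slab.h>

struct example_ns {
	refcount_t count;
};

static void free_example_ns(struct example_ns *ns)
{
	kfree(ns);
}

/* NULL-tolerant put: callers no longer need "if (ns)" at every call site. */
static inline void put_example_ns(struct example_ns *ns)
{
	if (ns && refcount_dec_and_test(&ns->count))
		free_example_ns(ns);
}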
diff --git a/kernel/padata.c b/kernel/padata.c
index b3d4eacc4f5d..7eee94166357 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd)
* To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
*/
padata_get_pd(pd);
- queue_work(pinst->serial_wq, &pd->reorder_work);
+ if (!queue_work(pinst->serial_wq, &pd->reorder_work))
+ padata_put_pd(pd);
}
}
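queue_work() returns false when the work item is already pending, in which case no additional execution of reorder_work will run to consume the reference taken just above, so it has to be dropped on the spot. The balancing pattern, restated as a minimal sketch using only the identifiers visible in the hunk:

/* Sketch of the reference balancing around queue_work(). */
static void queue_reorder_sketch(struct padata_instance *pinst,
				 struct parallel_data *pd)
{
	padata_get_pd(pd);				/* pin pd for the async work */
	if (!queue_work(pinst->serial_wq, &pd->reorder_work))
		padata_put_pd(pd);			/* already queued: drop our ref */
	/* otherwise the queued work drops the reference when it finishes */
}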
diff --git a/kernel/params.c b/kernel/params.c
index 2509f216c9f3..b92d64161b75 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -760,38 +760,35 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
int err;
kobj = kset_find_obj(module_kset, name);
- if (kobj) {
- mk = to_module_kobject(kobj);
- } else {
- mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
- BUG_ON(!mk);
-
- mk->mod = THIS_MODULE;
- mk->kobj.kset = module_kset;
- err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
- "%s", name);
-#ifdef CONFIG_MODULES
- if (!err)
- err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
-#endif
- if (err) {
- kobject_put(&mk->kobj);
- pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
- name, err);
- return NULL;
- }
+ if (kobj)
+ return to_module_kobject(kobj);
- /* So that we hold reference in both cases. */
- kobject_get(&mk->kobj);
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ if (!mk)
+ return NULL;
+
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
+ if (IS_ENABLED(CONFIG_MODULES) && !err)
+ err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+ if (err) {
+ kobject_put(&mk->kobj);
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+ name, err);
+ return NULL;
}
+ /* So that we hold reference in both cases. */
+ kobject_get(&mk->kobj);
+
return mk;
}
@@ -802,7 +799,7 @@ static void __init kernel_add_sysfs_param(const char *name,
struct module_kobject *mk;
int err;
- mk = locate_module_kobject(name);
+ mk = lookup_or_create_module_kobject(name);
if (!mk)
return;
@@ -873,7 +870,7 @@ static void __init version_sysfs_builtin(void)
int err;
for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
- mk = locate_module_kobject(vattr->module_name);
+ mk = lookup_or_create_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
WARN_ON_ONCE(err);
@@ -946,7 +943,9 @@ struct kset *module_kset;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
const struct kobj_type module_ktype = {
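With the BUG_ON() gone, an allocation or sysfs failure now surfaces as a NULL return, so callers have to bail out quietly the way kernel_add_sysfs_param() and version_sysfs_builtin() do in the hunks above. A minimal caller sketch (the attribute and function name are illustrative):

static void __init add_builtin_attr_sketch(const char *modname,
					   const struct attribute *attr)
{
	struct module_kobject *mk;

	mk = lookup_or_create_module_kobject(modname);
	if (!mk)
		return;		/* failure was already logged by the helper */

	WARN_ON_ONCE(sysfs_create_file(&mk->kobj, attr));
	kobject_put(&mk->kobj);	/* drop the reference taken by the lookup */
}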
diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817f..8317bcbc7cf7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,6 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
+ WARN_ON_ONCE(pid->stashed);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -359,11 +360,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task,
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
- if (type == PIDTYPE_PID) {
- WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
- wake_up_all(&pid->wait_pidfd);
- }
-
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 23c0f4e6cb2f..338c9917d4ee 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -778,6 +778,8 @@ int hibernate(void)
goto Restore;
ksys_sync_helper();
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
error = freeze_processes();
if (error)
@@ -846,6 +848,7 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_HIBERNATION);
Restore:
pm_restore_console();
@@ -882,6 +885,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
+
error = freeze_processes();
if (error)
goto exit;
@@ -941,6 +947,7 @@ thaw:
thaw_processes();
exit:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_HIBERNATION);
restore:
@@ -1029,19 +1036,26 @@ static int software_resume(void)
if (error)
goto Restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
+
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
- if (error)
+ if (error) {
+ filesystems_thaw();
goto Close_Finish;
+ }
error = freeze_kernel_threads();
if (error) {
thaw_processes();
+ filesystems_thaw();
goto Close_Finish;
}
error = load_image_and_restore();
thaw_processes();
+ filesystems_thaw();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
Restore:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6254814d4817..0b0e76324c43 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -962,6 +962,34 @@ power_attr(pm_freeze_timeout);
#endif /* CONFIG_FREEZER*/
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+bool filesystem_freeze_enabled = false;
+
+static ssize_t freeze_filesystems_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled);
+}
+
+static ssize_t freeze_filesystems_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ filesystem_freeze_enabled = !!val;
+ return n;
+}
+
+power_attr(freeze_filesystems);
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -992,6 +1020,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_FREEZER
&pm_freeze_timeout_attr.attr,
#endif
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ &freeze_filesystems_attr.attr,
+#endif
NULL,
};
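The new knob appears as /sys/power/freeze_filesystems and takes 0 or 1; when enabled, the hibernate.c and suspend.c hunks of this patch call filesystems_freeze() before freezing processes. Purely as an illustration of the same parse-and-store shape (not a proposed change), kstrtobool() would accept the usual boolean spellings as well:

static ssize_t freeze_filesystems_store_sketch(struct kobject *kobj,
					       struct kobj_attribute *attr,
					       const char *buf, size_t n)
{
	bool val;

	if (kstrtobool(buf, &val))
		return -EINVAL;

	filesystem_freeze_enabled = val;
	return n;
}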
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c352dea2f67b..2eb81662b8fa 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -18,6 +18,10 @@ struct swsusp_info {
unsigned long size;
} __aligned(PAGE_SIZE);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern bool filesystem_freeze_enabled;
+#endif
+
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
extern void __init hibernate_reserved_size_init(void);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8eaec4ab121d..76b141b9aac0 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/fs.h>
#include "power.h"
@@ -374,6 +375,8 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
@@ -550,6 +553,7 @@ int suspend_devices_and_enter(suspend_state_t state)
static void suspend_finish(void)
{
suspend_thaw_processes();
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
@@ -588,6 +592,8 @@ static int enter_state(suspend_state_t state)
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
}
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
@@ -609,6 +615,7 @@ static int enter_state(suspend_state_t state)
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
+ filesystems_thaw();
mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ff5f933a62..ad13c461b657 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -268,35 +268,26 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr)
+{
+ return bdev_rw_virt(file_bdev(hib_resume_bdev_file),
+ page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf);
+}
+
+static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr,
struct hib_bio_batch *hb)
{
- struct page *page = virt_to_page(addr);
struct bio *bio;
- int error = 0;
bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- pr_err("Adding page to bio failed at %llu\n",
- (unsigned long long)bio->bi_iter.bi_sector);
- bio_put(bio);
- return -EFAULT;
- }
-
- if (hb) {
- bio->bi_end_io = hib_end_io;
- bio->bi_private = hb;
- atomic_inc(&hb->count);
- submit_bio(bio);
- } else {
- error = submit_bio_wait(bio);
- bio_put(bio);
- }
-
- return error;
+ bio_add_virt_nofail(bio, addr, PAGE_SIZE);
+ bio->bi_end_io = hib_end_io;
+ bio->bi_private = hb;
+ atomic_inc(&hb->count);
+ submit_bio(bio);
+ return 0;
}
static int hib_wait_io(struct hib_bio_batch *hb)
@@ -316,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -329,8 +320,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
- swsusp_resume_block, swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+ swsusp_resume_block, swsusp_header);
} else {
pr_err("Swap header not found!\n");
error = -ENODEV;
@@ -380,36 +371,30 @@ static int swsusp_swap_check(void)
static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
void *src;
int ret;
if (!offset)
return -ENOSPC;
- if (hb) {
- src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- ret = hib_wait_io(hb); /* Free pages */
- if (ret)
- return ret;
- src = (void *)__get_free_page(GFP_NOIO |
- __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- WARN_ON_ONCE(1);
- hb = NULL; /* Go synchronous */
- src = buf;
- }
- }
- } else {
- src = buf;
+ if (!hb)
+ goto sync_io;
+
+ src = (void *)__get_free_page(gfp);
+ if (!src) {
+ ret = hib_wait_io(hb); /* Free pages */
+ if (ret)
+ return ret;
+ src = (void *)__get_free_page(gfp);
+ if (WARN_ON_ONCE(!src))
+ goto sync_io;
}
- return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+
+ copy_page(src, buf);
+ return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+sync_io:
+ return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -1041,7 +1026,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map);
if (error) {
release_swap_reader(handle);
return error;
@@ -1065,7 +1050,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
+ if (hb)
+ error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb);
+ else
+ error = hib_submit_io_sync(REQ_OP_READ, offset, buf);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1590,8 +1578,8 @@ int swsusp_check(bool exclusive)
BLK_OPEN_READ, holder, NULL);
if (!IS_ERR(hib_resume_bdev_file)) {
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header);
if (error)
goto put;
@@ -1599,9 +1587,9 @@ int swsusp_check(bool exclusive)
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
swsusp_header_flags = swsusp_header->flags;
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
- swsusp_header, NULL);
+ swsusp_header);
} else {
error = -EINVAL;
}
@@ -1650,13 +1638,12 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
- swsusp_header, NULL);
+ swsusp_header);
} else {
pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
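Splitting the helper into hib_submit_io_sync() and hib_submit_io_async() makes the batching contract explicit: every asynchronous submission is accounted in the hib_bio_batch and must eventually be reaped with hib_wait_io(). A trimmed sketch of that caller pattern (loop structure and names invented for illustration):

/* Sketch: submit a run of pages asynchronously, then reap the whole batch. */
static int write_pages_sketch(void **pages, sector_t *offsets, int nr,
			      struct hib_bio_batch *hb)
{
	int i, error;

	for (i = 0; i < nr; i++) {
		error = hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC,
					    offsets[i], pages[i], hb);
		if (error)
			return error;
	}
	return hib_wait_io(hb);		/* waits for completions, returns first error */
}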
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index eed2951a4962..9cf01832a6c3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -57,6 +57,9 @@
/* Low-order bit definition for polled grace-period APIs. */
#define RCU_GET_STATE_COMPLETED 0x1
+/* A complete grace period count */
+#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
+
extern int sysctl_sched_rt_runtime;
/*
@@ -157,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
* Given a snapshot from rcu_seq_snap(), determine whether or not a
* full update-side operation has occurred, but do not allow the
* (ULONG_MAX / 2) safety-factor/guard-band.
+ *
+ * The token returned by get_state_synchronize_rcu_full() is based on
+ * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full()
+ * against the root rnp->gp_seq. Since rcu_seq_start() is first called
+ * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq,
+ * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace
+ * periods ahead of the root rnp->gp_seq. To prevent the full polling API from
+ * reporting a false positive, where a wrap-around appears to have instantly
+ * completed the GP when nothing of the sort happened, adjust for the 2 GPs in
+ * the ULONG_CMP_LT().
*/
static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP));
}
/*
@@ -572,6 +584,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
unsigned long c_old,
unsigned long c);
void rcu_gp_set_torture_wait(int duration);
+void rcu_set_gpwrap_lag(unsigned long lag);
+int rcu_get_gpwrap_count(int cpu);
#else
static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
@@ -589,6 +603,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
do { } while (0)
#endif
static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_set_gpwrap_lag(unsigned long lag) { }
+static inline int rcu_get_gpwrap_count(int cpu) { return 0; }
#endif
unsigned long long rcutorture_gather_gp_seqs(void);
void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
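For concreteness, assuming the usual RCU_SEQ_CTR_SHIFT of 2, RCU_SEQ_STATE_MASK is 0x3 and RCU_SEQ_GP is 4, so the guard band changes from (3 * RCU_SEQ_STATE_MASK + 1) = 10 counts to 2 * RCU_SEQ_GP = 8 counts, i.e. exactly the two full grace periods that rcu_seq_snap(rcu_state.gp_seq) can run ahead of the root rnp->gp_seq. Restated with the constant spelled out (illustration only):

/* Sketch with RCU_SEQ_GP assumed to be 4 (two low-order state bits). */
static bool rcu_seq_done_exact_sketch(unsigned long *sp, unsigned long s)
{
	unsigned long cur_s = READ_ONCE(*sp);

	/* Done if at/after the snapshot, or more than 2 GPs (8 counts) behind it. */
	return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * 4));
}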
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 0f3059b1b80d..b521d0455992 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -762,7 +762,7 @@ kfree_scale_thread(void *arg)
}
for (i = 0; i < kfree_alloc_num; i++) {
- alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
+ alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
return -ENOMEM;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4fa7772be183..b5db1ac32a3d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -115,6 +115,10 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing");
+torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period.");
+torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)");
+torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable");
@@ -413,6 +417,8 @@ struct rcu_torture_ops {
bool (*reader_blocked)(void);
unsigned long long (*gather_gp_seqs)(void);
void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
+ void (*set_gpwrap_lag)(unsigned long lag);
+ int (*get_gpwrap_count)(int cpu);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -619,6 +625,8 @@ static struct rcu_torture_ops rcu_ops = {
: NULL,
.gather_gp_seqs = rcutorture_gather_gp_seqs,
.format_gp_seqs = rcutorture_format_gp_seqs,
+ .set_gpwrap_lag = rcu_set_gpwrap_lag,
+ .get_gpwrap_count = rcu_get_gpwrap_count,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -2164,53 +2172,70 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta
return &rtrsp[j];
}
-/*
- * Do one read-side critical section, returning false if there was
- * no data to read. Can be invoked both from process context and
- * from a timer handler.
- */
-static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
-{
- bool checkpolling = !(torture_random(trsp) & 0xfff);
+struct rcu_torture_one_read_state {
+ bool checkpolling;
unsigned long cookie;
struct rcu_gp_oldstate cookie_full;
- int i;
unsigned long started;
- unsigned long completed;
- int newstate;
struct rcu_torture *p;
- int pipe_count;
- bool preempted = false;
- int readstate = 0;
- struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } };
- struct rt_read_seg *rtrsp = &rtseg[0];
- struct rt_read_seg *rtrsp1;
+ int readstate;
+ struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS];
+ struct rt_read_seg *rtrsp;
unsigned long long ts;
+};
- WARN_ON_ONCE(!rcu_is_watching());
- newstate = rcutorture_extend_mask(readstate, trsp);
- rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++);
- if (checkpolling) {
+static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ memset(rtorsp, 0, sizeof(*rtorsp));
+ rtorsp->checkpolling = !(torture_random(trsp) & 0xfff);
+ rtorsp->rtrsp = &rtorsp->rtseg[0];
+}
+
+/*
+ * Set up the first segment of a series of overlapping read-side
+ * critical sections. The caller must have actually initiated the
+ * outermost read-side critical section.
+ */
+static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ rtorsp->cookie = cur_ops->get_gp_state();
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- cur_ops->get_gp_state_full(&cookie_full);
+ cur_ops->get_gp_state_full(&rtorsp->cookie_full);
}
- started = cur_ops->get_gp_seq();
- ts = rcu_trace_clock_local();
- p = rcu_dereference_check(rcu_torture_current,
+ rtorsp->started = cur_ops->get_gp_seq();
+ rtorsp->ts = rcu_trace_clock_local();
+ rtorsp->p = rcu_dereference_check(rcu_torture_current,
!cur_ops->readlock_held || cur_ops->readlock_held());
- if (p == NULL) {
+ if (rtorsp->p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
return false;
}
- if (p->rtort_mbtest == 0)
+ if (rtorsp->p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- rcu_torture_reader_do_mbchk(myid, p, trsp);
- rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp);
+ rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp);
+ return true;
+}
+
+/*
+ * Complete the last segment of a series of overlapping read-side
+ * critical sections and check for errors.
+ */
+static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ int i;
+ unsigned long completed;
+ int pipe_count;
+ bool preempted = false;
+ struct rt_read_seg *rtrsp1;
+
preempt_disable();
- pipe_count = READ_ONCE(p->rtort_pipe_count);
+ pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
// Should not happen in a correct RCU implementation,
// happens quite often for torture_type=busted.
@@ -2218,28 +2243,28 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
completed = cur_ops->get_gp_seq();
if (pipe_count > 1) {
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, started, completed);
+ do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu,
+ rtorsp->ts, rtorsp->started, completed);
rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = rcutorture_seq_diff(completed, started);
+ completed = rcutorture_seq_diff(completed, rtorsp->started);
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (checkpolling) {
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie),
"%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ rtorsp->cookie, cur_ops->get_gp_state());
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full),
"%s: Cookie check 6 failed %s(%d) online %*pbl\n",
__func__,
rcu_torture_writer_state_getname(),
@@ -2248,21 +2273,42 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
if (cur_ops->reader_blocked)
preempted = cur_ops->reader_blocked();
- rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate);
+ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ WARN_ON_ONCE(rtorsp->readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
- WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
+ WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1);
/* If error or close call, record the sequence of reader protections. */
if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) {
i = 0;
- for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++)
+ for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++)
err_segs[i++] = *rtrsp1;
rt_read_nsegs = i;
rt_read_preempted = preempted;
}
+}
+/*
+ * Do one read-side critical section, returning false if there was
+ * no data to read. Can be invoked both from process context and
+ * from a timer handler.
+ */
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
+{
+ int newstate;
+ struct rcu_torture_one_read_state rtors;
+
+ WARN_ON_ONCE(!rcu_is_watching());
+ init_rcu_torture_one_read_state(&rtors, trsp);
+ newstate = rcutorture_extend_mask(rtors.readstate, trsp);
+ rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++);
+ if (!rcu_torture_one_read_start(&rtors, trsp, myid)) {
+ rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp);
+ return false;
+ }
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp, myid);
return true;
}
@@ -2307,7 +2353,7 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
@@ -2394,6 +2440,7 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long n_gpwraps = 0;
struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
@@ -2404,6 +2451,8 @@ rcu_torture_stats_print(void)
pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
+ if (cur_ops->get_gpwrap_count)
+ n_gpwraps += cur_ops->get_gpwrap_count(cpu);
}
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
@@ -2435,8 +2484,9 @@ rcu_torture_stats_print(void)
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
- pr_cont("nocb-toggles: %ld:%ld\n",
+ pr_cont("nocb-toggles: %ld:%ld ",
atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
+ pr_cont("gpwraps: %ld\n", n_gpwraps);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -3036,7 +3086,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
rfp->rcu_launder_gp_seq_start = gps;
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -3607,6 +3657,57 @@ static int rcu_torture_preempt(void *unused)
static enum cpuhp_state rcutor_hp;
+static struct hrtimer gpwrap_lag_timer;
+static bool gpwrap_lag_active;
+
+/* Timer handler for toggling RCU grace-period sequence overflow test lag value */
+static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer)
+{
+ ktime_t next_delay;
+
+ if (gpwrap_lag_active) {
+ pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n");
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+ next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0);
+ } else {
+ pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps);
+ cur_ops->set_gpwrap_lag(gpwrap_lag_gps);
+ gpwrap_lag_active = true;
+ next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0);
+ }
+
+ if (torture_must_stop_irq())
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward_now(timer, next_delay);
+ return HRTIMER_RESTART;
+}
+
+static int rcu_gpwrap_lag_init(void)
+{
+ if (!gpwrap_lag)
+ return 0;
+
+ if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) {
+ pr_alert("rcu-torture: lag timing parameters must be positive\n");
+ return -EINVAL;
+ }
+
+ hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ gpwrap_lag_active = false;
+ hrtimer_start(&gpwrap_lag_timer,
+ ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL);
+
+ return 0;
+}
+
+static void rcu_gpwrap_lag_cleanup(void)
+{
+ hrtimer_cancel(&gpwrap_lag_timer);
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+}
static void
rcu_torture_cleanup(void)
{
@@ -3776,6 +3877,9 @@ rcu_torture_cleanup(void)
torture_cleanup_end();
if (cur_ops->gp_slow_unregister)
cur_ops->gp_slow_unregister(NULL);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag)
+ rcu_gpwrap_lag_cleanup();
}
static void rcu_torture_leak_cb(struct rcu_head *rhp)
@@ -4272,9 +4376,17 @@ rcu_torture_init(void)
}
if (object_debug)
rcu_test_debug_objects();
- torture_init_end();
+
if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag) {
+ firsterr = rcu_gpwrap_lag_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ torture_init_end();
return 0;
unwind:
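With the defaults above (gpwrap_lag_cycle_mins=30, gpwrap_lag_active_mins=5, gpwrap_lag_gps=8), the hrtimer alternates 25 minutes at the boot-time lag with 5 minutes at an 8-GP lag, so each half-hour cycle deliberately provokes a burst of ->gpwrap events that then shows up in the new "gpwraps:" statistic. The delay selection reduces to the small helper sketched here (defaults hard-coded for illustration):

/* Sketch: next timer delay, where @lag_active is the state just entered. */
static ktime_t gpwrap_next_delay_sketch(bool lag_active)
{
	const int cycle_mins = 30, active_mins = 5;	/* module-parameter defaults */

	return lag_active ? ktime_set(active_mins * 60, 0)
			  : ktime_set((cycle_mins - active_mins) * 60, 0);
}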
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9a59b071501b..48047260697e 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1589,7 +1589,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
if (cookie != SRCU_GET_STATE_COMPLETED &&
- !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 659f83e71048..e8a4b720d7d2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -80,6 +80,15 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
};
+
+int rcu_get_gpwrap_count(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return READ_ONCE(rdp->gpwrap_count);
+}
+EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count);
+
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
@@ -757,6 +766,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
+static unsigned long seq_gpwrap_lag = ULONG_MAX / 4;
+
+/**
+ * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value.
+ * @lag_gps: Set the overflow lag to this many grace periods' worth of counters,
+ * which is used by rcutorture to quickly force a gpwrap situation.
+ * @lag_gps = 0 resets it back to the boot-time value.
+ */
+void rcu_set_gpwrap_lag(unsigned long lag_gps)
+{
+ unsigned long lag_seq_count;
+
+ lag_seq_count = (lag_gps == 0)
+ ? ULONG_MAX / 4
+ : lag_gps << RCU_SEQ_CTR_SHIFT;
+ WRITE_ONCE(seq_gpwrap_lag, lag_seq_count);
+}
+EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag);
+
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -767,9 +795,11 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
- rnp->gp_seq))
+ if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag,
+ rnp->gp_seq)) {
WRITE_ONCE(rdp->gpwrap, true);
+ WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1);
+ }
if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}
@@ -801,6 +831,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp)
return 0;
}
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+
/*
* Returns positive if the specified CPU has passed through a quiescent state
* by virtue of being in or having passed through an dynticks idle state since
@@ -936,9 +970,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp)
rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
- rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
- rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
- rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(cpu);
rsrp->jiffies = jiffies;
rsrp->gp_seq = rdp->gp_seq;
}
@@ -1060,38 +1094,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
-static void swake_up_one_online_ipi(void *arg)
-{
- struct swait_queue_head *wqh = arg;
-
- swake_up_one(wqh);
-}
-
-static void swake_up_one_online(struct swait_queue_head *wqh)
-{
- int cpu = get_cpu();
-
- /*
- * If called from rcutree_report_cpu_starting(), wake up
- * is dangerous that late in the CPU-down hotplug process. The
- * scheduler might queue an ignored hrtimer. Defer the wake up
- * to an online CPU instead.
- */
- if (unlikely(cpu_is_offline(cpu))) {
- int target;
-
- target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
- cpu_online_mask);
-
- smp_call_function_single(target, swake_up_one_online_ipi,
- wqh, 0);
- put_cpu();
- } else {
- put_cpu();
- swake_up_one(wqh);
- }
-}
-
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1116,7 +1118,7 @@ static void rcu_gp_kthread_wake(void)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one_online(&rcu_state.gp_wq);
+ swake_up_one(&rcu_state.gp_wq);
}
/*
@@ -1798,6 +1800,7 @@ static noinline_for_stack bool rcu_gp_init(void)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
bool start_new_poll;
+ unsigned long old_gp_seq;
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
@@ -1825,7 +1828,12 @@ static noinline_for_stack bool rcu_gp_init(void)
*/
start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
+ old_gp_seq = rcu_state.gp_seq;
rcu_seq_start(&rcu_state.gp_seq);
+ /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq)));
+
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
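The lag is kept in sequence-counter units, so rcu_set_gpwrap_lag() converts grace periods with lag_gps << RCU_SEQ_CTR_SHIFT: with rcutorture's default of 8 GPs and the usual shift of 2 that is 32 counts, versus the production default of ULONG_MAX / 4, which is why rcu_gpnum_ovf() can now be made to set ->gpwrap (and bump ->gpwrap_count) within minutes. The conversion alone, as a standalone sketch:

/* Sketch of the GP-to-sequence-count conversion, assuming RCU_SEQ_CTR_SHIFT == 2. */
static unsigned long gpwrap_lag_to_seq_sketch(unsigned long lag_gps)
{
	return lag_gps ? lag_gps << 2 : ULONG_MAX / 4;	/* e.g. 8 GPs -> 32 counts */
}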
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a9a811d9d7a3..3830c19cf2f6 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -168,7 +168,7 @@ struct rcu_snap_record {
u64 cputime_irq; /* Accumulated cputime of hard irqs */
u64 cputime_softirq;/* Accumulated cputime of soft irqs */
u64 cputime_system; /* Accumulated cputime of kernel tasks */
- unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ u64 nr_hardirqs; /* Accumulated number of hard irqs */
unsigned int nr_softirqs; /* Accumulated number of soft irqs */
unsigned long long nr_csw; /* Accumulated number of task switches */
unsigned long jiffies; /* Track jiffies value */
@@ -183,6 +183,7 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
+ unsigned int gpwrap_count; /* Count of GP sequence wrap. */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8d4895c854c5..c36c7d5575ca 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -200,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake)
- swake_up_one_online(&rcu_state.expedited_wq);
+ swake_up_one(&rcu_state.expedited_wq);
break;
}
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index fa269d34167a..1596812f7f12 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -216,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- swake_up_one_online(&rdp_gp->nocb_gp_wq);
+ swake_up_one(&rdp_gp->nocb_gp_wq);
}
return needwake;
@@ -554,19 +554,13 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
- } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) {
+ } else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
rcu_nocb_unlock(rdp);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- /*
- * Don't do the wake-up upfront on fragile paths.
- * Also offline CPUs can't call swake_up_one_online() from
- * (soft-)IRQs. Rely on the final deferred wake-up from
- * rcutree_report_cpu_dead()
- */
rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3c0bbbbb686f..0b0f56f6abc8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,7 +29,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
lockdep_is_held(&rdp->nocb_lock) ||
lockdep_is_held(&rcu_state.nocb_mutex) ||
- (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 925fcdad5dea..56b21219442b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -435,8 +435,8 @@ static void print_cpu_stat_info(int cpu)
rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
pr_err("\t hardirqs softirqs csw/system\n");
- pr_err("\t number: %8ld %10d %12lld\n",
- kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ pr_err("\t number: %8lld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs,
kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
nr_context_switches_cpu(cpu) - rsrp->nr_csw);
pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c81cf642dba0..62b3416f5e43 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -66,6 +66,7 @@
#include <linux/vtime.h>
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
+#include <linux/livepatch_sched.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
@@ -1752,7 +1753,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
}
}
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags)
{
enum uclamp_id clamp_id;
@@ -1768,7 +1769,8 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
- if (p->se.sched_delayed)
+	/* Only inc for a delayed task that is being woken up. */
+ if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
return;
for_each_clamp_id(clamp_id)
@@ -2036,7 +2038,7 @@ static void __init init_uclamp(void)
}
#else /* !CONFIG_UCLAMP_TASK */
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
@@ -2072,12 +2074,14 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- p->sched_class->enqueue_task(rq, p, flags);
/*
- * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
- * ->sched_delayed.
+ * Can be before ->enqueue_task() because uclamp considers the
+ * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
+ * in ->enqueue_task().
*/
- uclamp_rq_inc(rq, p);
+ uclamp_rq_inc(rq, p, flags);
+
+ p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2283,6 +2287,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
+ /*
+	 * If the task is sched_delayed, force-dequeue it to avoid always
+	 * hitting the tick timeout in the queued case.
+ */
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
@@ -6571,12 +6581,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* Otherwise marks the task's __state as RUNNING
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long task_state)
+ unsigned long *task_state_p)
{
+ unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
+ *task_state_p = TASK_RUNNING;
return false;
}
@@ -6668,6 +6680,8 @@ static void __sched notrace __schedule(int sched_mode)
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
+ klp_sched_try_switch(prev);
+
local_irq_disable();
rcu_note_context_switch(preempt);
@@ -6713,7 +6727,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, &prev_state);
switch_count = &prev->nvcsw;
}
@@ -7328,7 +7342,6 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
- klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -7500,7 +7513,6 @@ int sched_dynamic_mode(const char *str)
#endif
static DEFINE_MUTEX(sched_dynamic_mutex);
-static bool klp_override;
static void __sched_dynamic_update(int mode)
{
@@ -7508,8 +7520,7 @@ static void __sched_dynamic_update(int mode)
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7518,8 +7529,7 @@ static void __sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
@@ -7530,8 +7540,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_voluntary:
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
@@ -7542,8 +7551,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_full:
- if (!klp_override)
- preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7554,8 +7562,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_lazy:
- if (!klp_override)
- preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7576,36 +7583,6 @@ void sched_dynamic_update(int mode)
mutex_unlock(&sched_dynamic_mutex);
}
-#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
-
-static int klp_cond_resched(void)
-{
- __klp_sched_try_switch();
- return __cond_resched();
-}
-
-void sched_dynamic_klp_enable(void)
-{
- mutex_lock(&sched_dynamic_mutex);
-
- klp_override = true;
- static_call_update(cond_resched, klp_cond_resched);
-
- mutex_unlock(&sched_dynamic_mutex);
-}
-
-void sched_dynamic_klp_disable(void)
-{
- mutex_lock(&sched_dynamic_mutex);
-
- klp_override = false;
- __sched_dynamic_update(preempt_dynamic_mode);
-
- mutex_unlock(&sched_dynamic_mutex);
-}
-
-#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
@@ -9018,7 +8995,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
unsigned long flags;
spin_lock_irqsave(&task_group_lock, flags);
- list_add_rcu(&tg->list, &task_groups);
+ list_add_tail_rcu(&tg->list, &task_groups);
/* Root should already exist: */
WARN_ON(!parent);
@@ -9204,11 +9181,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *task;
struct cgroup_subsys_state *css;
+ if (!rt_group_sched_enabled())
+ goto scx_check;
+
cgroup_taskset_for_each(task, css, tset) {
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
}
-#endif
+scx_check:
+#endif /* CONFIG_RT_GROUP_SCHED */
return scx_cgroup_can_attach(tset);
}
@@ -9861,18 +9842,6 @@ static struct cftype cpu_legacy_files[] = {
.seq_show = cpu_cfs_local_stat_show,
},
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- {
- .name = "rt_runtime_us",
- .read_s64 = cpu_rt_runtime_read,
- .write_s64 = cpu_rt_runtime_write,
- },
- {
- .name = "rt_period_us",
- .read_u64 = cpu_rt_period_read_uint,
- .write_u64 = cpu_rt_period_write_uint,
- },
-#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -9890,6 +9859,55 @@ static struct cftype cpu_legacy_files[] = {
{ } /* Terminate */
};
+#ifdef CONFIG_RT_GROUP_SCHED
+static struct cftype rt_group_files[] = {
+ {
+ .name = "rt_runtime_us",
+ .read_s64 = cpu_rt_runtime_read,
+ .write_s64 = cpu_rt_runtime_write,
+ },
+ {
+ .name = "rt_period_us",
+ .read_u64 = cpu_rt_period_read_uint,
+ .write_u64 = cpu_rt_period_write_uint,
+ },
+ { } /* Terminate */
+};
+
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DEFINE_STATIC_KEY_FALSE(rt_group_sched);
+# else
+DEFINE_STATIC_KEY_TRUE(rt_group_sched);
+# endif
+
+static int __init setup_rt_group_sched(char *str)
+{
+ long val;
+
+ if (kstrtol(str, 0, &val) || val < 0 || val > 1) {
+ pr_warn("Unable to set rt_group_sched\n");
+ return 1;
+ }
+ if (val)
+ static_branch_enable(&rt_group_sched);
+ else
+ static_branch_disable(&rt_group_sched);
+
+ return 1;
+}
+__setup("rt_group_sched=", setup_rt_group_sched);
+
+static int __init cpu_rt_group_init(void)
+{
+ if (!rt_group_sched_enabled())
+ return 0;
+
+ WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files));
+ return 0;
+}
+subsys_initcall(cpu_rt_group_init);
+#endif /* CONFIG_RT_GROUP_SCHED */
+
static int cpu_extra_stat_show(struct seq_file *sf,
struct cgroup_subsys_state *css)
{
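rt_group_sched_enabled() is expected to be a thin predicate over the static key defined here, flipped at boot by rt_group_sched=0/1; the sketch below shows one plausible shape, but the real definition lives in the scheduler headers and may differ in branch hint and default.

/* Sketch only; not the in-tree definition. */
#ifdef CONFIG_RT_GROUP_SCHED
DECLARE_STATIC_KEY_TRUE(rt_group_sched);	/* _FALSE when default-disabled */

static inline bool rt_group_sched_enabled_sketch(void)
{
	return static_branch_likely(&rt_group_sched);
}
#else
static inline bool rt_group_sched_enabled_sketch(void)
{
	return false;
}
#endif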
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1a19d69b91ed..816f07f9d30f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->limits_changed)) {
- sg_policy->limits_changed = false;
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+ WRITE_ONCE(sg_policy->limits_changed, false);
+ sg_policy->need_freq_update = true;
+
+ /*
+ * The above limits_changed update must occur before the reads
+ * of policy limits in cpufreq_driver_resolve_freq() or a policy
+ * limits update might be missed, so use a memory barrier to
+ * ensure it.
+ *
+ * This pairs with the write memory barrier in sugov_limits().
+ */
+ smp_mb();
+
+ return true;
+ } else if (sg_policy->need_freq_update) {
+ /* ignore_dl_rate_limit() wants a new frequency to be found. */
return true;
}
@@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->need_freq_update)
+ if (sg_policy->need_freq_update) {
sg_policy->need_freq_update = false;
- else if (sg_policy->next_freq == next_freq)
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
return false;
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
@@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
- sg_cpu->sg_policy->limits_changed = true;
+ sg_cpu->sg_policy->need_freq_update = true;
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -871,7 +897,16 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->limits_changed = true;
+ /*
+ * The limits_changed update below must take place before the updates
+ * of policy limits in cpufreq_set_policy() or a policy limits update
+ * might be missed, so use a memory barrier to ensure it.
+ *
+ * This pairs with the memory barrier in sugov_should_update_freq().
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sg_policy->limits_changed, true);
}
struct cpufreq_governor schedutil_gov = {
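The consumer-side rule in sugov_should_update_freq() is the key one: the limits_changed flag has to be cleared before the policy limits are read, otherwise an update landing between the read and the clear would be wiped out without ever being acted upon. A generic restatement of just that ordering (identifiers invented, not cpufreq API):

/*
 * Illustration only: clear the "changed" flag, then, past a full barrier,
 * sample the data it guards.  An update racing with this either is seen in
 * the sampled data or leaves the flag set for the next invocation.
 */
static bool consume_if_changed(bool *changed, const unsigned int *data,
			       unsigned int *out)
{
	if (!READ_ONCE(*changed))
		return false;

	WRITE_ONCE(*changed, false);
	smp_mb();	/* pairs with the producer's smp_wmb() before setting the flag */
	*out = READ_ONCE(*data);
	return true;
}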
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 56ae54e0ce6a..557246880a7e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -588,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
+
+ if (sd->flags & SD_ASYM_PACKING)
+ debugfs_create_u32("group_asym_prefer_cpu", 0444, parent,
+ (u32 *)&sd->groups->asym_prefer_cpu);
}
void update_sched_domain_debugfs(void)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca..f5133249fd4d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -163,7 +163,7 @@ enum scx_ops_flags {
/*
* CPU cgroup support flags
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
@@ -1118,8 +1118,38 @@ static void scx_kf_disallow(u32 mask)
current->scx.kf_mask &= ~mask;
}
-#define SCX_CALL_OP(mask, op, args...) \
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+static DEFINE_PER_CPU(struct rq *, locked_rq);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ __this_cpu_write(locked_rq, rq);
+}
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(locked_rq);
+}
+
+#define SCX_CALL_OP(mask, op, rq, args...) \
do { \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
scx_ops.op(args); \
@@ -1127,11 +1157,14 @@ do { \
} else { \
scx_ops.op(args); \
} \
+ update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, args...) \
+#define SCX_CALL_OP_RET(mask, op, rq, args...) \
({ \
__typeof__(scx_ops.op(args)) __ret; \
+ \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
__ret = scx_ops.op(args); \
@@ -1139,6 +1172,7 @@ do { \
} else { \
__ret = scx_ops.op(args); \
} \
+ update_locked_rq(NULL); \
__ret; \
})
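Because every SCX_CALL_OP*() variant now brackets the callback with update_locked_rq(), helper code running inside a callback can consult scx_locked_rq() rather than guessing which rq, if any, is already held. A hedged sketch of that usage (the helper below is illustrative, not an existing kfunc, and ignores double-lock ordering concerns):

/* Illustration only: touch @cpu's rq from scx callback context. */
static void touch_cpu_rq_sketch(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	if (rq == scx_locked_rq()) {
		update_rq_clock(rq);		/* rq already locked by the callback */
		return;
	}

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);
	rq_unlock_irqrestore(rq, &rf);
}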
@@ -1153,31 +1187,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, task, ##args); \
+ SCX_CALL_OP(mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \
({ \
__typeof__(scx_ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \
({ \
__typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -2172,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2269,7 +2303,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
add_nr_running(rq, 1);
if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2283,7 +2317,7 @@ out:
__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
unsigned long opss;
@@ -2304,7 +2338,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
BUG();
case SCX_OPSS_QUEUED:
if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2337,7 +2371,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
return true;
}
- ops_dequeue(p, deq_flags);
+ ops_dequeue(rq, p, deq_flags);
/*
* A currently running task which is going off @rq first gets dequeued
@@ -2353,11 +2387,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
*/
if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
}
if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2377,7 +2411,7 @@ static void yield_task_scx(struct rq *rq)
struct task_struct *p = rq->curr;
if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
@@ -2387,7 +2421,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
struct task_struct *from = rq->curr;
if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
else
return false;
}
@@ -2945,7 +2979,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2990,7 +3024,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
prev_on_scx ? prev : NULL);
flush_dispatch_buf(rq);
@@ -3104,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
- ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
@@ -3112,7 +3146,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3193,8 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE,
- cpu_release, cpu_of(rq), &args);
+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3207,7 +3240,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3348,7 +3381,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
* verifier.
*/
if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3385,7 +3418,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = p;
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, p, prev_cpu, wake_flags);
+ select_cpu, NULL, p, prev_cpu, wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
@@ -3430,8 +3463,8 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
@@ -3444,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"cpu %d going %s, exiting scheduler", cpu,
@@ -3550,7 +3583,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3627,7 +3660,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args);
if (unlikely(ret)) {
ret = ops_sanitize_err("init_task", ret);
return ret;
@@ -3668,9 +3701,10 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
static void scx_ops_enable_task(struct task_struct *p)
{
+ struct rq *rq = task_rq(p);
u32 weight;
- lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_rq_held(rq);
/*
* Set the weight before calling ops.enable() so that the scheduler
@@ -3684,20 +3718,22 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
static void scx_ops_disable_task(struct task_struct *p)
{
- lockdep_assert_rq_held(task_rq(p));
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3726,7 +3762,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3835,7 +3871,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3851,8 +3887,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3899,35 +3935,6 @@ bool scx_can_stop_tick(struct rq *rq)
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
static bool scx_cgroup_enabled;
-static bool cgroup_warned_missing_weight;
-static bool cgroup_warned_missing_idle;
-
-static void scx_cgroup_warn_missing_weight(struct task_group *tg)
-{
- if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
- cgroup_warned_missing_weight)
- return;
-
- if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
- return;
-
- pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
- scx_ops.name);
- cgroup_warned_missing_weight = true;
-}
-
-static void scx_cgroup_warn_missing_idle(struct task_group *tg)
-{
- if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
- return;
-
- if (!tg->idle)
- return;
-
- pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
- scx_ops.name);
- cgroup_warned_missing_idle = true;
-}
int scx_tg_online(struct task_group *tg)
{
@@ -3937,14 +3944,12 @@ int scx_tg_online(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
- scx_cgroup_warn_missing_weight(tg);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(cgroup_init)) {
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
tg->css.cgroup, &args);
if (ret)
ret = ops_sanitize_err("cgroup_init", ret);
@@ -3966,7 +3971,7 @@ void scx_tg_offline(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3999,7 +4004,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
continue;
if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -4013,8 +4018,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -4032,8 +4037,8 @@ void scx_cgroup_move_task(struct task_struct *p)
* cgrp_moving_from set.
*/
if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
- p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4052,8 +4057,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
out_unlock:
@@ -4066,7 +4071,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
if (scx_cgroup_enabled && tg->scx_weight != weight) {
if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
}
@@ -4076,9 +4081,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- percpu_down_read(&scx_cgroup_rwsem);
- scx_cgroup_warn_missing_idle(tg);
- percpu_up_read(&scx_cgroup_rwsem);
+ /* TODO: Implement ops->cgroup_set_idle() */
}
static void scx_cgroup_lock(void)
@@ -4257,7 +4260,7 @@ static void scx_cgroup_exit(void)
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4272,9 +4275,6 @@ static int scx_cgroup_init(void)
percpu_rwsem_assert_held(&scx_cgroup_rwsem);
- cgroup_warned_missing_weight = false;
- cgroup_warned_missing_idle = false;
-
/*
* scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
@@ -4284,9 +4284,6 @@ static int scx_cgroup_init(void)
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
- scx_cgroup_warn_missing_weight(tg);
- scx_cgroup_warn_missing_idle(tg);
-
if ((tg->scx_flags &
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
@@ -4300,7 +4297,7 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
@@ -4623,7 +4620,7 @@ unlock:
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4639,7 +4636,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
@@ -4797,7 +4794,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
@@ -5004,7 +5001,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
if (SCX_HAS_OP(dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -5051,7 +5048,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (SCX_HAS_OP(dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5108,7 +5105,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
used = seq_buf_used(&ns);
if (SCX_HAS_OP(dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5252,6 +5249,9 @@ static int validate_ops(const struct sched_ext_ops *ops)
return -EINVAL;
}
+ if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
+ pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+
return 0;
}
@@ -5364,7 +5364,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_idle_enable(ops);
if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) {
ret = ops_sanitize_err("init", ret);
cpus_read_unlock();
@@ -6827,6 +6827,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
@@ -7113,13 +7119,32 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
if (ops_cpu_valid(cpu, NULL)) {
- struct rq *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+ struct rq_flags rf;
+
+ /*
+ * When called with an rq lock held, restrict the operation
+ * to the corresponding CPU to prevent ABBA deadlocks.
+ */
+ if (locked_rq && rq != locked_rq) {
+ scx_ops_error("Invalid target CPU %d", cpu);
+ return;
+ }
+
+ /*
+		 * If no rq lock is held, allow operating on any CPU by
+ * acquiring the corresponding rq lock.
+ */
+ if (!locked_rq) {
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ }
rq->scx.cpuperf_target = perf;
+ cpufreq_update_util(rq, 0);
- rcu_read_lock_sched_notrace();
- cpufreq_update_util(cpu_rq(cpu), 0);
- rcu_read_unlock_sched_notrace();
+ if (!locked_rq)
+ rq_unlock_irqrestore(rq, &rf);
}
}
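
The rule scx_bpf_cpuperf_set() now follows can be reduced to: if some rq is already locked, only that CPU may be targeted; otherwise lock the target's rq just around the update. A hedged userspace model of that decision with pthread mutexes (names illustrative only, not kernel code):

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t cpu_lock[NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int cpu_perf[NR_CPUS];

/* @held is the CPU whose lock the caller already holds, or -1 for none. */
static int set_cpu_perf(int cpu, int perf, int held)
{
	if (held >= 0 && held != cpu)
		return -1;			/* would risk ABBA deadlock */

	if (held < 0)
		pthread_mutex_lock(&cpu_lock[cpu]);

	cpu_perf[cpu] = perf;			/* the protected update */

	if (held < 0)
		pthread_mutex_unlock(&cpu_lock[cpu]);
	return 0;
}

int main(void)
{
	set_cpu_perf(2, 800, -1);		/* no lock held: lock cpu 2 */

	pthread_mutex_lock(&cpu_lock[1]);
	if (set_cpu_perf(2, 900, 1) < 0)	/* holding cpu 1, target 2 */
		printf("rejected cross-CPU update while holding cpu 1\n");
	pthread_mutex_unlock(&cpu_lock[1]);

	printf("cpu2 perf = %d\n", cpu_perf[2]);
	return 0;
}
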
@@ -7350,12 +7375,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cb343ca889e0..e67a19a071c1 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -674,7 +674,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
* managed by put_prev_task_idle()/set_next_task_idle().
*/
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
/*
* Update the idle masks:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e43993a4e580..125912c0e9dd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3795,6 +3795,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3821,10 +3822,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check
@@ -4933,13 +4935,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
goto done;
/*
- * To avoid overestimation of actual task utilization, skip updates if
- * we cannot grant there is idle time in this CPU.
- */
- if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
- return;
-
- /*
* To avoid underestimate of task utilization, skip updates of EWMA if
* we cannot grant that thread got all CPU time it wanted.
*/
@@ -6941,7 +6936,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
@@ -7081,9 +7076,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
h_nr_idle = task_has_idle_policy(p);
if (task_sleep || task_delayed || !se->sched_delayed)
h_nr_runnable = 1;
- } else {
- cfs_rq = group_cfs_rq(se);
- slice = cfs_rq_min_slice(cfs_rq);
}
for_each_sched_entity(se) {
@@ -7093,6 +7085,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (p && &p->se == se)
return -1;
+ slice = cfs_rq_min_slice(cfs_rq);
break;
}
@@ -7183,7 +7176,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
@@ -7198,6 +7191,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
return true;
}
+static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
+{
+ return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
+}
+
#ifdef CONFIG_SMP
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
@@ -7359,8 +7357,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
- if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return this_cpu;
+ if (sync) {
+ struct rq *rq = cpu_rq(this_cpu);
+
+ if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
+ return this_cpu;
+ }
if (available_idle_cpu(prev_cpu))
return prev_cpu;
@@ -10258,7 +10260,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
(sgs->group_weight - sgs->idle_cpus != 1))
return false;
- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
}
/* One group has more than one SMT CPU while the other group does not */
@@ -10495,7 +10497,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
+ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
+ READ_ONCE(sg->asym_prefer_cpu));
case group_misfit_task:
/*
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81bc8b329ef1..93b038d48900 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -40,7 +40,7 @@ int housekeeping_any_cpu(enum hk_type type)
if (cpu < nr_cpu_ids)
return cpu;
- cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
if (likely(cpu < nr_cpu_ids))
return cpu;
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index fa03ec3ed56a..e40422c37033 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -89,6 +89,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+ rt_rq->tg = &root_task_group;
#endif
}
@@ -175,11 +176,14 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
+ /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
+ WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group);
return rt_se->rt_rq;
}
@@ -187,11 +191,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_se->rt_rq;
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
void unregister_rt_sched_group(struct task_group *tg)
{
+ if (!rt_group_sched_enabled())
+ return;
+
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
}
@@ -200,6 +208,9 @@ void free_rt_sched_group(struct task_group *tg)
{
int i;
+ if (!rt_group_sched_enabled())
+ return;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -244,6 +255,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
struct sched_rt_entity *rt_se;
int i;
+ if (!rt_group_sched_enabled())
+ return 1;
+
tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
if (!tg->rt_rq)
goto err;
@@ -482,9 +496,6 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- if (!rt_rq->tg)
- return RUNTIME_INF;
-
return rt_rq->rt_runtime;
}
@@ -497,6 +508,11 @@ typedef struct task_group *rt_rq_iter_t;
static inline struct task_group *next_task_group(struct task_group *tg)
{
+ if (!rt_group_sched_enabled()) {
+ WARN_ON(tg != &root_task_group);
+ return NULL;
+ }
+
do {
tg = list_entry_rcu(tg->list.next,
typeof(struct task_group), list);
@@ -509,9 +525,9 @@ static inline struct task_group *next_task_group(struct task_group *tg)
}
#define for_each_rt_rq(rt_rq, iter, rq) \
- for (iter = container_of(&task_groups, typeof(*iter), list); \
- (iter = next_task_group(iter)) && \
- (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+ for (iter = &root_task_group; \
+ iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \
+ iter = next_task_group(iter))
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -1066,13 +1082,12 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -1082,13 +1097,12 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
@@ -1156,8 +1170,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}
static void
@@ -1257,11 +1270,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_RT_GROUP_SCHED
/* schedstats is not supported for rt group. */
if (!rt_entity_is_task(rt_se))
return NULL;
-#endif
return &rt_task_of(rt_se)->stats;
}
@@ -1883,6 +1894,27 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1913,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1944,27 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
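
The rewritten check in find_lock_lowest_rq() is an instance of the usual drop-locks/re-lock/re-validate pattern: after the source rq lock has been dropped in order to take both locks, the chosen task is only used if it is still what pick_next_pushable_task() returns. A reduced userspace sketch of that pattern with pthreads (invented names, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct rq_model {
	pthread_mutex_t lock;
	int head_task;			/* id of first pushable task, -1 if none */
};

static int pick_next_pushable(struct rq_model *rq)
{
	return rq->head_task;		/* caller must hold rq->lock */
}

static bool lock_both_and_validate(struct rq_model *src, struct rq_model *dst,
				   int task)
{
	/* Drop src, then take both in a fixed order to avoid deadlock. */
	pthread_mutex_unlock(&src->lock);
	pthread_mutex_lock(&dst->lock);
	pthread_mutex_lock(&src->lock);

	/* The world may have changed while src was unlocked: re-check. */
	if (pick_next_pushable(src) != task) {
		pthread_mutex_unlock(&dst->lock);
		return false;		/* caller retries or gives up */
	}
	return true;			/* both locks held, task still valid */
}

int main(void)
{
	struct rq_model src = { PTHREAD_MUTEX_INITIALIZER, 7 };
	struct rq_model dst = { PTHREAD_MUTEX_INITIALIZER, -1 };

	pthread_mutex_lock(&src.lock);
	int task = pick_next_pushable(&src);

	if (lock_both_and_validate(&src, &dst, task)) {
		printf("task %d can be pushed\n", task);
		pthread_mutex_unlock(&dst.lock);
	}
	pthread_mutex_unlock(&src.lock);
	return 0;
}
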
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
@@ -2602,8 +2611,9 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
{
struct rt_rq *rt_rq;
-#ifdef CONFIG_RT_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq
rt_rq = task_group(p)->rt_rq[cpu];
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
#else
rt_rq = &cpu_rq(cpu)->rt;
#endif
@@ -2713,6 +2723,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
+ if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
+ return -EBUSY;
+
total = to_ratio(period, runtime);
/*
@@ -2868,7 +2881,7 @@ static int sched_rt_global_constraints(void)
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept real-time tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
return 0;
return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..c5a6a503eb6d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -813,15 +813,17 @@ struct rt_rq {
#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
+ u64 rt_time; /* consumed RT time, goes up in update_curr_rt */
+ u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
unsigned int rt_nr_boosted;
- struct rq *rq;
- struct task_group *tg;
+ struct rq *rq; /* this is always top-level rq, cache? */
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */
#endif
};
@@ -1498,6 +1500,23 @@ static inline bool sched_group_cookie_match(struct rq *rq,
}
#endif /* !CONFIG_SCHED_CORE */
+#ifdef CONFIG_RT_GROUP_SCHED
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DECLARE_STATIC_KEY_FALSE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_unlikely(&rt_group_sched);
+}
+# else
+DECLARE_STATIC_KEY_TRUE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_likely(&rt_group_sched);
+}
+# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
+#else
+# define rt_group_sched_enabled() false
+#endif /* CONFIG_RT_GROUP_SCHED */
static inline void lockdep_assert_rq_held(struct rq *rq)
{
@@ -2146,6 +2165,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * p->rt.rt_rq is NULL initially and it is easier to assign
+	 * root_task_group's rt_rq than switching in rt_rq_of_se().
+ * Clobbers tg(!)
+ */
+ if (!rt_group_sched_enabled())
+ tg = &root_task_group;
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
#endif
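
The header above only exposes the DECLARE_ side and the rt_group_sched_enabled() helper; the matching definition and whatever flips the key at boot live elsewhere in this series. As a hedged sketch of what that side typically looks like (the toggle helper below is an assumption for illustration, not the actual code):

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>

#ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
DEFINE_STATIC_KEY_FALSE(rt_group_sched);	/* default: off, opt in */
#else
DEFINE_STATIC_KEY_TRUE(rt_group_sched);		/* default: on, opt out */
#endif

/*
 * Hypothetical toggle, shown only to illustrate how such a key is flipped,
 * e.g. from an early init path or a boot-parameter handler defined
 * elsewhere in the series.
 */
static void __init rt_group_sched_set(bool enable)
{
	if (enable)
		static_branch_enable(&rt_group_sched);
	else
		static_branch_disable(&rt_group_sched);
}
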
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index c326de1344fb..547c1f05b667 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -634,13 +634,14 @@ change:
* Do not allow real-time tasks into groups that have no runtime
* assigned.
*/
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ if (rt_group_sched_enabled() &&
+ rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
retval = -EPERM;
goto unlock;
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1ebc60d967f..a2a38e1b6f18 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1333,6 +1333,64 @@ next:
update_group_capacity(sd, cpu);
}
+#ifdef CONFIG_SMP
+
+/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
+void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+ int asym_prefer_cpu = cpu;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg;
+ int group_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ continue;
+
+ /*
+		 * Groups of overlapping domains are replicated per NUMA
+ * node and will require updating "asym_prefer_cpu" on
+ * each local copy.
+ *
+ * If you are hitting this warning, consider moving
+ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
+ * which is shared by all the overlapping groups.
+ */
+ WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+
+ sg = sd->groups;
+ if (cpu != sg->asym_prefer_cpu) {
+ /*
+ * Since the parent is a superset of the current group,
+ * if the cpu is not the "asym_prefer_cpu" at the
+ * current level, it cannot be the preferred CPU at a
+			 * higher level either.
+ */
+ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu))
+ return;
+
+ WRITE_ONCE(sg->asym_prefer_cpu, cpu);
+ continue;
+ }
+
+ /* Ranking has improved; CPU is still the preferred one. */
+ if (new_prio >= old_prio)
+ continue;
+
+ for_each_cpu(group_cpu, sched_group_span(sg)) {
+ if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
+ asym_prefer_cpu = group_cpu;
+ }
+
+ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
+ }
+}
+
+#endif /* CONFIG_SMP */
+
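
Reduced to a single group, the update rule above is: a non-preferred CPU only matters if its new priority now beats the current preferred CPU; if the preferred CPU itself got demoted, rescan the whole group. A userspace model of exactly that, with plain arrays standing in for cpumasks and arch_asym_cpu_priority() (not kernel code):

#include <stdio.h>

#define NR 4

static int prio[NR] = { 10, 30, 20, 5 };	/* per-CPU priority      */
static int asym_prefer_cpu = 1;			/* CPU with highest prio */

static void update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{
	prio[cpu] = new_prio;

	if (cpu != asym_prefer_cpu) {
		/* A non-preferred CPU can only matter if it now wins. */
		if (new_prio > prio[asym_prefer_cpu])
			asym_prefer_cpu = cpu;
		return;
	}

	/* Preferred CPU got better (or stayed the same): nothing to do. */
	if (new_prio >= old_prio)
		return;

	/* Preferred CPU got worse: rescan the whole group. */
	for (int i = 0; i < NR; i++)
		if (prio[i] > prio[asym_prefer_cpu])
			asym_prefer_cpu = i;
}

int main(void)
{
	update_asym_prefer_cpu(1, 30, 8);	/* demote the preferred CPU */
	printf("new asym_prefer_cpu = %d\n", asym_prefer_cpu);	/* prints 2 */
	return 0;
}
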
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -2098,7 +2156,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
for (i = 0; i < sched_domains_numa_levels; i++) {
if (!masks[i][j])
break;
- cpu = cpumask_any_and(cpus, masks[i][j]);
+ cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
if (cpu < nr_cpu_ids) {
found = cpu;
break;
@@ -2347,35 +2405,54 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
- * any two given CPUs at this (non-NUMA) topology level.
+ * any two given CPUs on non-NUMA topology levels.
*/
-static bool topology_span_sane(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, int cpu)
+static bool topology_span_sane(const struct cpumask *cpu_map)
{
- int i = cpu + 1;
+ struct sched_domain_topology_level *tl;
+ struct cpumask *covered, *id_seen;
+ int cpu;
- /* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
- return true;
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+ id_seen = sched_domains_tmpmask2;
+
+ for_each_sd_topology(tl) {
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ continue;
+
+ cpumask_clear(covered);
+ cpumask_clear(id_seen);
- /*
- * Non-NUMA levels cannot partially overlap - they must be either
- * completely equal or completely disjoint. Otherwise we can end up
- * breaking the sched_group lists - i.e. a later get_group() pass
- * breaks the linking done for an earlier span.
- */
- for_each_cpu_from(i, cpu_map) {
/*
- * We should 'and' all those masks with 'cpu_map' to exactly
- * match the topology we're about to build, but that can only
- * remove CPUs, which only lessens our ability to detect
- * overlaps
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
*/
- if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
- cpumask_intersects(tl->mask(cpu), tl->mask(i)))
- return false;
- }
+ for_each_cpu(cpu, cpu_map) {
+ const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ int id;
+
+ /* lowest bit set in this mask is used as a unique id */
+ id = cpumask_first(tl_cpu_mask);
+
+ if (cpumask_test_cpu(id, id_seen)) {
+ /* First CPU has already been seen, ensure identical spans */
+ if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ return false;
+ } else {
+ /* First CPU hasn't been seen before, ensure it's a completely new span */
+ if (cpumask_intersects(tl_cpu_mask, covered))
+ return false;
+ cpumask_or(covered, covered, tl_cpu_mask);
+ cpumask_set_cpu(id, id_seen);
+ }
+ }
+ }
return true;
}
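
The rewritten check uses the lowest CPU of each span as its identity: at a given (non-NUMA) level, a span must either be identical to the one already recorded for that id, or completely disjoint from everything covered so far. A self-contained userspace model for one level, with 64-bit masks standing in for cpumasks (not kernel code, limited to 64 CPUs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool spans_sane(const uint64_t *mask, int nr_cpus)
{
	uint64_t covered = 0, id_seen = 0;
	uint64_t first_span[64] = { 0 };	/* span recorded per id */

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		uint64_t span = mask[cpu];
		int id = __builtin_ctzll(span);	/* lowest CPU in the span */

		if (id_seen & (1ULL << id)) {
			if (first_span[id] != span)
				return false;	/* same id, different span */
		} else {
			if (span & covered)
				return false;	/* new span partially overlaps */
			covered |= span;
			id_seen |= 1ULL << id;
			first_span[id] = span;
		}
	}
	return true;
}

int main(void)
{
	uint64_t ok[4]  = { 0x3, 0x3, 0xc, 0xc };	/* {0,1} {0,1} {2,3} {2,3} */
	uint64_t bad[4] = { 0x3, 0x6, 0xc, 0xc };	/* {1,2} overlaps {0,1}    */

	printf("ok:  %d\n", spans_sane(ok, 4));		/* 1 */
	printf("bad: %d\n", spans_sane(bad, 4));	/* 0 */
	return 0;
}
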
@@ -2408,9 +2485,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = NULL;
for_each_sd_topology(tl) {
- if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
- goto error;
-
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
@@ -2424,6 +2498,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ if (WARN_ON(!topology_span_sane(cpu_map)))
+ goto error;
+
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..adc0de0aa364 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2820,6 +2821,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = posixtimer_create_prctl(arg2);
break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3, arg4);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 517ee2590a29..30899a8cc52c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -366,7 +366,7 @@ static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
- return ((struct hrtimer *) addr)->function;
+ return ACCESS_PRIVATE((struct hrtimer *)addr, function);
}
/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a47bcf71defc..9a3859443c04 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -509,6 +509,7 @@ void tick_resume(void)
#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP);
static unsigned int tick_freeze_depth;
/**
@@ -528,9 +529,22 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ /*
+ * All other CPUs have their interrupts disabled and are
+ * suspended to idle. Other tasks have been frozen so there
+ * is no scheduling happening. This means that there is no
+ * concurrency in the system at this point. Therefore it is
+ * okay to acquire a sleeping lock on PREEMPT_RT, such as a
+ * spinlock, because the lock cannot be held by other CPUs
+ * or threads and acquiring it cannot block.
+ *
+ * Inform lockdep about the situation.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
system_state = SYSTEM_SUSPEND;
sched_clock_suspend();
timekeeping_suspend();
+ lock_map_release(&tick_freeze_map);
} else {
tick_suspend_local();
}
@@ -552,8 +566,16 @@ void tick_unfreeze(void)
raw_spin_lock(&tick_freeze_lock);
if (tick_freeze_depth == num_online_cpus()) {
+ /*
+ * Similar to tick_freeze(). On resumption the first CPU may
+ * acquire uncontended sleeping locks while other CPUs block on
+ * tick_freeze_lock.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
timekeeping_resume();
sched_clock_resume();
+ lock_map_release(&tick_freeze_map);
+
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..a009c91f7b05 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec could be adjusted by a small negative
+ * amount when the multiplication factor of the clock is adjusted, which
+ * could cause the coarse clocks to go slightly backwards. See
+ * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
+ * clockids which only is updated when the clock has been set or we have
+ * clockids, which is only updated when the clock has been set or we have
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_update_coarse_nsecs(tk);
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk_update_coarse_nsecs(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_normalize_xtime(tk);
delta -= incr;
}
+ tk_update_coarse_nsecs(tk);
}
/**
@@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
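
All of the coarse readers touched in this file follow the same seqcount retry shape: read (sec, coarse_nsec), then retry if a writer ran in between. A simplified userspace model of that retry loop (C11 atomics with default sequentially consistent ordering; the kernel's seqcount_t uses hand-placed barriers instead, and the names here are invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;
static _Atomic int64_t xtime_sec;
static _Atomic uint32_t coarse_nsec;

static void write_time(int64_t sec, uint32_t nsec)	/* one writer at a time */
{
	atomic_fetch_add(&seq, 1);	/* odd: update in progress */
	atomic_store(&xtime_sec, sec);
	atomic_store(&coarse_nsec, nsec);
	atomic_fetch_add(&seq, 1);	/* even again: update done */
}

static void read_time(int64_t *sec, uint32_t *nsec)
{
	unsigned int start;

	do {
		while ((start = atomic_load(&seq)) & 1)
			;		/* writer active: wait and retry */
		*sec = atomic_load(&xtime_sec);
		*nsec = atomic_load(&coarse_nsec);
	} while (atomic_load(&seq) != start);
}

int main(void)
{
	int64_t sec;
	uint32_t nsec;

	write_time(1000, 250000000);
	read_time(&sec, &nsec);
	printf("%lld.%09u\n", (long long)sec, nsec);
	return 0;
}
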
@@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
struct timekeeper *real_tk = &tk_core.timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
- u64 offset;
+ u64 offset, orig_offset;
guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
-
+ orig_offset = offset;
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
@@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
+ /*
+	 * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls
+	 * making small negative adjustments to the base xtime_nsec
+	 * value, only update the coarse clocks if we accumulated time.
+ */
+ if (orig_offset != offset)
+ tk_update_coarse_nsecs(tk);
+
timekeeping_update_from_shadow(&tk_core, clock_set);
return !!clock_set;
@@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 01c2ab1e8971..32ef27c71b57 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *tk)
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3679a6d18934..3f6a7bdc6edf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -893,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
-{
- blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
-}
-
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio)
{
@@ -1089,8 +1084,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
- WARN_ON(ret);
ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
@@ -1125,7 +1118,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
@@ -1462,7 +1454,6 @@ static const struct {
[__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
[__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
- [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
[__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
};
@@ -1896,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'S';
if (opf & REQ_META)
rwbs[i++] = 'M';
+ if (opf & REQ_ATOMIC)
+ rwbs[i++] = 'U';
rwbs[i] = '\0';
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 33082c4e8154..ba7ff14f5339 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -89,8 +89,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
{
lockdep_assert_held(&fprobe_mutex);
- WRITE_ONCE(node->fp, NULL);
- hlist_del_rcu(&node->hlist);
+ /* Avoid double deleting */
+ if (READ_ONCE(node->fp) != NULL) {
+ WRITE_ONCE(node->fp, NULL);
+ hlist_del_rcu(&node->hlist);
+ }
return !!find_first_fprobe_node(node->addr);
}
@@ -411,6 +414,103 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
}
+#ifdef CONFIG_MODULES
+
+#define FPROBE_IPS_BATCH_INIT 8
+/* instruction pointer address list */
+struct fprobe_addr_list {
+ int index;
+ int size;
+ unsigned long *addrs;
+};
+
+static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+{
+ unsigned long *addrs;
+
+ if (alist->index >= alist->size)
+ return -ENOMEM;
+
+ alist->addrs[alist->index++] = addr;
+ if (alist->index < alist->size)
+ return 0;
+
+ /* Expand the address list */
+ addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
+ alist->size *= 2;
+ kfree(alist->addrs);
+ alist->addrs = addrs;
+
+ return 0;
+}
+
+static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
+ struct fprobe_addr_list *alist)
+{
+ struct fprobe_hlist_node *node;
+ int ret = 0;
+
+ hlist_for_each_entry_rcu(node, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (!within_module(node->addr, mod))
+ continue;
+ if (delete_fprobe_node(node))
+ continue;
+ /*
+		 * If we failed to update alist, just continue updating the hlist.
+		 * Therefore, at least the user handler will no longer be hit.
+ */
+ if (!ret)
+ ret = fprobe_addr_list_add(alist, node->addr);
+ }
+}
+
+/* Handle module unloading to manage fprobe_ip_table. */
+static int fprobe_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+ struct module *mod = data;
+ int i;
+
+ if (val != MODULE_STATE_GOING)
+ return NOTIFY_DONE;
+
+ alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
+	/* If we failed to allocate memory, we cannot remove the ips from the hash. */
+ if (!alist.addrs)
+ return NOTIFY_DONE;
+
+ mutex_lock(&fprobe_mutex);
+ for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
+ fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
+
+ if (alist.index < alist.size && alist.index > 0)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops,
+ alist.addrs, alist.index, 1, 0);
+ mutex_unlock(&fprobe_mutex);
+
+ kfree(alist.addrs);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fprobe_module_nb = {
+ .notifier_call = fprobe_module_callback,
+ .priority = 0,
+};
+
+static int __init init_fprobe_module(void)
+{
+ return register_module_notifier(&fprobe_module_nb);
+}
+early_initcall(init_fprobe_module);
+#endif
+
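
The address list used by the notifier grows by doubling: an entry is appended first, and only when the array becomes full is a twice-as-large copy allocated for the next call, so a failed growth only affects later additions. A runnable userspace sketch of that helper, with libc allocation in place of kcalloc()/kfree() (not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct addr_list {
	int index;
	int size;
	unsigned long *addrs;
};

static int addr_list_add(struct addr_list *alist, unsigned long addr)
{
	unsigned long *addrs;

	if (alist->index >= alist->size)
		return -1;			/* a previous grow attempt failed */

	alist->addrs[alist->index++] = addr;
	if (alist->index < alist->size)
		return 0;

	/* List is now full: grow it so the next add still succeeds. */
	addrs = calloc(alist->size * 2, sizeof(*addrs));
	if (!addrs)
		return -1;
	memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
	alist->size *= 2;
	free(alist->addrs);
	alist->addrs = addrs;
	return 0;
}

int main(void)
{
	struct addr_list alist = { .size = 2 };

	alist.addrs = calloc(alist.size, sizeof(*alist.addrs));
	for (unsigned long a = 0x1000; a < 0x1005; a++)
		if (addr_list_add(&alist, a))
			break;
	printf("stored %d addresses, capacity %d\n", alist.index, alist.size);
	free(alist.addrs);
	return 0;
}
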
static int symbols_cmp(const void *a, const void *b)
{
const char **str_a = (const char **) a;
@@ -445,6 +545,7 @@ struct filter_match_data {
size_t index;
size_t size;
unsigned long *addrs;
+ struct module **mods;
};
static int filter_match_callback(void *data, const char *name, unsigned long addr)
@@ -458,30 +559,47 @@ static int filter_match_callback(void *data, const char *name, unsigned long add
if (!ftrace_location(addr))
return 0;
- if (match->addrs)
- match->addrs[match->index] = addr;
+ if (match->addrs) {
+ struct module *mod = __module_text_address(addr);
+ if (mod && !try_module_get(mod))
+ return 0;
+
+ match->mods[match->index] = mod;
+ match->addrs[match->index] = addr;
+ }
match->index++;
return match->index == match->size;
}
/*
* Make IP list from the filter/no-filter glob patterns.
- * Return the number of matched symbols, or -ENOENT.
+ * Return the number of matched symbols, or errno.
+ * If @addrs == NULL, this just counts the number of matched symbols. If @addrs
+ * is passed with an array, we need to pass an @mods array of the same size
+ * to increment the module refcount for each symbol.
+ * This means we also need to call `module_put` for each element of @mods after
+ * using @addrs.
*/
-static int ip_list_from_filter(const char *filter, const char *notfilter,
- unsigned long *addrs, size_t size)
+static int get_ips_from_filter(const char *filter, const char *notfilter,
+ unsigned long *addrs, struct module **mods,
+ size_t size)
{
struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
- .index = 0, .size = size, .addrs = addrs};
+ .index = 0, .size = size, .addrs = addrs, .mods = mods};
int ret;
+ if (addrs && !mods)
+ return -EINVAL;
+
ret = kallsyms_on_each_symbol(filter_match_callback, &match);
if (ret < 0)
return ret;
- ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
- if (ret < 0)
- return ret;
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ }
return match.index ?: -ENOENT;
}
@@ -543,24 +661,35 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
*/
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
- unsigned long *addrs;
- int ret;
+ unsigned long *addrs __free(kfree) = NULL;
+ struct module **mods __free(kfree) = NULL;
+ int ret, num;
if (!fp || !filter)
return -EINVAL;
- ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
- if (ret < 0)
- return ret;
+ num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+ if (num < 0)
+ return num;
- addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
if (!addrs)
return -ENOMEM;
- ret = ip_list_from_filter(filter, notfilter, addrs, ret);
- if (ret > 0)
- ret = register_fprobe_ips(fp, addrs, ret);
- kfree(addrs);
+ mods = kcalloc(num, sizeof(*mods), GFP_KERNEL);
+ if (!mods)
+ return -ENOMEM;
+
+ ret = get_ips_from_filter(filter, notfilter, addrs, mods, num);
+ if (ret < 0)
+ return ret;
+
+ ret = register_fprobe_ips(fp, addrs, ret);
+
+ for (int i = 0; i < num; i++) {
+ if (mods[i])
+ module_put(mods[i]);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1a48aedb5255..6981830c3128 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1297,6 +1297,8 @@ void ftrace_free_filter(struct ftrace_ops *ops)
return;
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
@@ -3256,6 +3258,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash,
}
/*
+ * Remove functions from @hash that are in @notrace_hash
+ */
+static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_node *tmp;
+ int size;
+ int i;
+
+ /* If the notrace hash is empty, there's nothing to do */
+ if (ftrace_hash_empty(notrace_hash))
+ return;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(notrace_hash, entry->ip))
+ continue;
+ remove_hash_entry(hash, entry);
+ kfree(entry);
+ }
+ }
+}
+
+/*
* Add to @hash only those that are in both @new_hash1 and @new_hash2
*
 * The notrace_hash update uses just the intersect_hash() function
@@ -3295,67 +3322,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has
return 0;
}
-/* Return a new hash that has a union of all @ops->filter_hash entries */
-static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
-{
- struct ftrace_hash *new_hash = NULL;
- struct ftrace_ops *subops;
- int size_bits;
- int ret;
-
- if (ops->func_hash->filter_hash)
- size_bits = ops->func_hash->filter_hash->size_bits;
- else
- size_bits = FTRACE_HASH_DEFAULT_BITS;
-
- list_for_each_entry(subops, &ops->subop_list, list) {
- ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits);
- if (ret < 0) {
- free_ftrace_hash(new_hash);
- return NULL;
- }
- /* Nothing more to do if new_hash is empty */
- if (ftrace_hash_empty(new_hash))
- break;
- }
- /* Can't return NULL as that means this failed */
- return new_hash ? : EMPTY_HASH;
-}
-
-/* Make @ops trace evenything except what all its subops do not trace */
-static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops)
-{
- struct ftrace_hash *new_hash = NULL;
- struct ftrace_ops *subops;
- int size_bits;
- int ret;
-
- list_for_each_entry(subops, &ops->subop_list, list) {
- struct ftrace_hash *next_hash;
-
- if (!new_hash) {
- size_bits = subops->func_hash->notrace_hash->size_bits;
- new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash);
- if (!new_hash)
- return NULL;
- continue;
- }
- size_bits = new_hash->size_bits;
- next_hash = new_hash;
- new_hash = alloc_ftrace_hash(size_bits);
- ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash);
- free_ftrace_hash(next_hash);
- if (ret < 0) {
- free_ftrace_hash(new_hash);
- return NULL;
- }
- /* Nothing more to do if new_hash is empty */
- if (ftrace_hash_empty(new_hash))
- break;
- }
- return new_hash;
-}
-
static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
{
struct ftrace_func_entry *entry;
@@ -3427,6 +3393,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_
return 0;
}
+static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *func_hash)
+{
+ /* If the filter hash is not empty, simply remove the notrace hash entries from it */
+ if (!ftrace_hash_empty(func_hash->filter_hash)) {
+ *filter_hash = copy_hash(func_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ remove_hash(*filter_hash, func_hash->notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+
+ } else {
+ *notrace_hash = copy_hash(func_hash->notrace_hash);
+ if (!*notrace_hash)
+ return -ENOMEM;
+ *filter_hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
+static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash)
+{
+ int size_bits;
+ int ret;
+
+ /* If the subops trace all functions, so must the main ops */
+ if (ftrace_hash_empty(ops_hash->filter_hash) ||
+ ftrace_hash_empty(subops_hash->filter_hash)) {
+ *filter_hash = EMPTY_HASH;
+ } else {
+ /*
+ * The main ops filter hash is not empty, so its
+ * notrace_hash had better be, as the notrace hash
+ * is only used for empty main filter hashes.
+ */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash));
+
+ size_bits = max(ops_hash->filter_hash->size_bits,
+ subops_hash->filter_hash->size_bits);
+
+ /* Copy the subops hash */
+ *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ /* Remove any notrace functions from the copy */
+ remove_hash(*filter_hash, subops_hash->notrace_hash);
+
+ ret = append_hash(filter_hash, ops_hash->filter_hash,
+ size_bits);
+ if (ret < 0) {
+ free_ftrace_hash(*filter_hash);
+ *filter_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+
+ /*
+ * Only process notrace hashes if the main filter hash is empty
+ * (tracing all functions), otherwise the filter hash will just
+ * remove the notrace hash functions, and the notrace hash is
+ * not needed.
+ */
+ if (ftrace_hash_empty(*filter_hash)) {
+ /*
+ * Intersect the notrace functions. That is, if two
+ * subops are not tracing a set of functions, the
+ * main ops will only not trace the functions that are
+ * in both subops, but must trace the functions that are
+ * are only notrace in one of the subops, for the other
+ * subops to be able to trace them.
+ */
+ size_bits = max(ops_hash->notrace_hash->size_bits,
+ subops_hash->notrace_hash->size_bits);
+ *notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!*notrace_hash)
+ return -ENOMEM;
+
+ ret = intersect_hash(notrace_hash, ops_hash->notrace_hash,
+ subops_hash->notrace_hash);
+ if (ret < 0) {
+ free_ftrace_hash(*notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+ return 0;
+}
+
/**
* ftrace_startup_subops - enable tracing for subops of an ops
* @ops: Manager ops (used to pick all the functions of its subops)
@@ -3439,11 +3494,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_
*/
int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
struct ftrace_hash *save_filter_hash;
struct ftrace_hash *save_notrace_hash;
- int size_bits;
int ret;
if (unlikely(ftrace_disabled))
@@ -3467,14 +3521,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
/* For the first subops to ops just enable it normally */
if (list_empty(&ops->subop_list)) {
- /* Just use the subops hashes */
- filter_hash = copy_hash(subops->func_hash->filter_hash);
- notrace_hash = copy_hash(subops->func_hash->notrace_hash);
- if (!filter_hash || !notrace_hash) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- return -ENOMEM;
- }
+
+ /* The ops was empty, so it should have empty hashes */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash));
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash));
+
+ ret = add_first_hash(&filter_hash, &notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
save_filter_hash = ops->func_hash->filter_hash;
save_notrace_hash = ops->func_hash->notrace_hash;
@@ -3500,48 +3554,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
/*
* Here there's already something attached. Here are the rules:
- * o If either filter_hash is empty then the final stays empty
- * o Otherwise, the final is a superset of both hashes
- * o If either notrace_hash is empty then the final stays empty
- * o Otherwise, the final is an intersection between the hashes
+ * If the new subops and main ops filter hashes are not empty:
+ * o Make a copy of the subops filter hash
+ * o Remove all functions in the subops notrace hash from it
+ * o Add in the main ops filter hash functions
+ * o Remove any of these functions from the main notrace hash
*/
- if (ftrace_hash_empty(ops->func_hash->filter_hash) ||
- ftrace_hash_empty(subops->func_hash->filter_hash)) {
- filter_hash = EMPTY_HASH;
- } else {
- size_bits = max(ops->func_hash->filter_hash->size_bits,
- subops->func_hash->filter_hash->size_bits);
- filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
- if (!filter_hash)
- return -ENOMEM;
- ret = append_hash(&filter_hash, subops->func_hash->filter_hash,
- size_bits);
- if (ret < 0) {
- free_ftrace_hash(filter_hash);
- return ret;
- }
- }
- if (ftrace_hash_empty(ops->func_hash->notrace_hash) ||
- ftrace_hash_empty(subops->func_hash->notrace_hash)) {
- notrace_hash = EMPTY_HASH;
- } else {
- size_bits = max(ops->func_hash->filter_hash->size_bits,
- subops->func_hash->filter_hash->size_bits);
- notrace_hash = alloc_ftrace_hash(size_bits);
- if (!notrace_hash) {
- free_ftrace_hash(filter_hash);
- return -ENOMEM;
- }
-
- ret = intersect_hash(&notrace_hash, ops->func_hash->filter_hash,
- subops->func_hash->filter_hash);
- if (ret < 0) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- return ret;
- }
- }
+ ret = add_next_hash(&filter_hash, &notrace_hash, ops->func_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
list_add(&subops->list, &ops->subop_list);
@@ -3557,6 +3579,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
return ret;
}
+static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops *ops)
+{
+ struct ftrace_ops_hash temp_hash;
+ struct ftrace_ops *subops;
+ bool first = true;
+ int ret;
+
+ temp_hash.filter_hash = EMPTY_HASH;
+ temp_hash.notrace_hash = EMPTY_HASH;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ *filter_hash = EMPTY_HASH;
+ *notrace_hash = EMPTY_HASH;
+
+ if (first) {
+ ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
+ first = false;
+ } else {
+ ret = add_next_hash(filter_hash, notrace_hash,
+ &temp_hash, subops->func_hash);
+ if (ret < 0) {
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+ return ret;
+ }
+ }
+
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+
+ temp_hash.filter_hash = *filter_hash;
+ temp_hash.notrace_hash = *notrace_hash;
+ }
+ return 0;
+}
+
/**
* ftrace_shutdown_subops - Remove a subops from a manager ops
* @ops: A manager ops to remove @subops from
@@ -3571,8 +3632,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
*/
int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
int ret;
if (unlikely(ftrace_disabled))
@@ -3605,14 +3666,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in
}
/* Rebuild the hashes without subops */
- filter_hash = append_hashes(ops);
- notrace_hash = intersect_hashes(ops);
- if (!filter_hash || !notrace_hash) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- list_add(&subops->list, &ops->subop_list);
- return -ENOMEM;
- }
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (ret < 0)
+ return ret;
ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
if (ret < 0) {
@@ -3628,11 +3684,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in
static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
struct ftrace_hash **orig_subhash,
- struct ftrace_hash *hash,
- int enable)
+ struct ftrace_hash *hash)
{
struct ftrace_ops *ops = subops->managed;
- struct ftrace_hash **orig_hash;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash;
struct ftrace_hash *save_hash;
struct ftrace_hash *new_hash;
int ret;
@@ -3649,24 +3705,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
return -ENOMEM;
}
- /* Create a new_hash to hold the ops new functions */
- if (enable) {
- orig_hash = &ops->func_hash->filter_hash;
- new_hash = append_hashes(ops);
- } else {
- orig_hash = &ops->func_hash->notrace_hash;
- new_hash = intersect_hashes(ops);
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (!ret) {
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
}
- /* Move the hash over to the new hash */
- ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable);
-
- free_ftrace_hash(new_hash);
-
if (ret) {
/* Put back the original hash */
- free_ftrace_hash_rcu(*orig_subhash);
+ new_hash = *orig_subhash;
*orig_subhash = save_hash;
+ free_ftrace_hash_rcu(new_hash);
} else {
free_ftrace_hash_rcu(save_hash);
}
@@ -4890,7 +4940,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
int enable)
{
if (ops->flags & FTRACE_OPS_FL_SUBOP)
- return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable);
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash);
/*
* If this ops is not enabled, it could be sharing its filters
@@ -4909,7 +4959,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
list_for_each_entry(subops, &op->subop_list, list) {
if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
subops->func_hash == ops->func_hash) {
- return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable);
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash);
}
}
} while_for_each_ftrace_op(op);
@@ -5914,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
- if (size > 32)
- size = 32;
- new_hash = alloc_ftrace_hash(fls(size));
+ size = fls(size);
+ if (size > FTRACE_HASH_MAX_BITS)
+ size = FTRACE_HASH_MAX_BITS;
+ new_hash = alloc_ftrace_hash(size);
if (!new_hash)
goto out_unlock;
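As a side note on the new helpers above: the filter/notrace combination rules in add_first_hash(), add_next_hash() and rebuild_hashes() reduce to a small piece of set algebra. The following is a stand-alone userspace sketch of that algebra only, using bitmasks as stand-ins for ftrace hashes (0 meaning an empty hash, i.e. "trace everything" for a filter); it is an illustration, not kernel code.

/* Model of the subops hash-merge rules with bitmask "hashes". */
#include <stdio.h>
#include <stdint.h>

struct ops_hash { uint32_t filter, notrace; };

/* add_first_hash(): the manager starts from the first subops alone. */
static struct ops_hash first(struct ops_hash sub)
{
	if (sub.filter)
		return (struct ops_hash){ sub.filter & ~sub.notrace, 0 };
	return (struct ops_hash){ 0, sub.notrace };
}

/* add_next_hash(): fold one more subops into the manager's hashes. */
static struct ops_hash next(struct ops_hash ops, struct ops_hash sub)
{
	struct ops_hash out = { 0, 0 };

	if (ops.filter && sub.filter)
		/* union of filters, minus what this subops refuses to trace */
		out.filter = ops.filter | (sub.filter & ~sub.notrace);
	if (!out.filter)
		/* trace everything except what *all* subops refuse */
		out.notrace = ops.notrace & sub.notrace;
	return out;
}

int main(void)
{
	struct ops_hash a = { .filter = 0x0f, .notrace = 0x01 };
	struct ops_hash b = { .filter = 0xf0, .notrace = 0x10 };
	struct ops_hash m = next(first(a), b);

	printf("filter=%#x notrace=%#x\n", m.filter, m.notrace); /* 0xee, 0 */
	return 0;
}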
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c0f877d39a24..3f9bf562beea 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1887,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
head_page = cpu_buffer->head_page;
- /* If both the head and commit are on the reader_page then we are done. */
- if (head_page == cpu_buffer->reader_page &&
- head_page == cpu_buffer->commit_page)
+ /* If the commit_buffer is the reader page, update the commit page */
+ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+ cpu_buffer->commit_page = cpu_buffer->reader_page;
+ /* Nothing more to do, the only page is the reader page */
goto done;
+ }
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 968c5c3b0246..e4077500a91d 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -225,7 +225,12 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
*/
bool rv_is_container_monitor(struct rv_monitor_def *mdef)
{
- struct rv_monitor_def *next = list_next_entry(mdef, list);
+ struct rv_monitor_def *next;
+
+ if (list_is_last(&mdef->list, &rv_monitors_list))
+ return false;
+
+ next = list_next_entry(mdef, list);
return next->parent == mdef->monitor || !mdef->monitor->enable;
}
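The guard added above is an instance of a general pattern: calling list_next_entry() on the last element walks past the entries onto the list head, which is not embedded in a monitor definition. A hedged sketch of the same pattern with a hypothetical struct:

#include <linux/list.h>

/* Hypothetical illustration of the guard used above. */
struct item {
	struct list_head list;
	int val;
};

static struct item *item_next(struct item *it, struct list_head *head)
{
	/* Without this check, list_next_entry() on the tail would hand back
	 * the list head reinterpreted as a struct item. */
	if (list_is_last(&it->list, head))
		return NULL;
	return list_next_entry(it, list);
}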
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b581e388a9d9..5b8db27fb6ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6043,8 +6043,10 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
tscratch = tr->scratch;
/* if there is no tscratch, module_delta must be NULL. */
module_delta = READ_ONCE(tr->module_delta);
- if (!module_delta || tscratch->entries[0].mod_addr > addr)
+ if (!module_delta || !tscratch->nr_entries ||
+ tscratch->entries[0].mod_addr > addr) {
return addr + tr->text_delta;
+ }
/* Note that entries must be sorted. */
nr_entries = tscratch->nr_entries;
@@ -6821,13 +6823,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
@@ -9806,6 +9809,7 @@ static int instance_mkdir(const char *name)
return ret;
}
+#ifdef CONFIG_MMU
static u64 map_pages(unsigned long start, unsigned long size)
{
unsigned long vmap_start, vmap_end;
@@ -9828,6 +9832,12 @@ static u64 map_pages(unsigned long start, unsigned long size)
return (u64)vmap_start;
}
+#else
+static inline u64 map_pages(unsigned long start, unsigned long size)
+{
+ return 0;
+}
+#endif
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index a322e4f249a5..5d64a18cacac 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
return ret;
}
+/*
+ * Locked version of event creation. Event creation must be protected by
+ * dyn_event_ops_mutex because that mutex also protects trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+ int ret;
+
+ mutex_lock(&dyn_event_ops_mutex);
+ ret = type->create(raw_command);
+ mutex_unlock(&dyn_event_ops_mutex);
+ return ret;
+}
+
static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee40d4e6ad1c..4ef4df6623a8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
__field_packed( unsigned long, graph_ent, func )
- __field_packed( unsigned long, graph_ent, depth )
+ __field_packed( unsigned int, graph_ent, depth )
__dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
+ F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth)
);
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index c08355c3ef32..916555f0de81 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[])
goto error;
}
}
+ trace_probe_log_clear();
return ret;
+
parse_error:
ret = -EINVAL;
error:
+ trace_probe_log_clear();
trace_event_probe_cleanup(ep);
return ret;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0993dfc1c5c1..2048560264bb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
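For context on the two checks fixed above: strncpy_from_kernel_nofault() and strncpy_from_user_nofault() return a positive length on success and a negative errno (typically -EFAULT) when the source faults, so only a negative return means the buffer must not be used. A hedged sketch of the intended check; the helper and its `src` argument are illustrative:

/* Illustrative helper; `src` and the surrounding context are assumptions. */
static char *copy_probe_string(const char *src)
{
	static char kbuf[USTRING_BUF_SIZE];
	long len;

	len = strncpy_from_kernel_nofault(kbuf, src, sizeof(kbuf));
	if (len < 0)		/* e.g. -EFAULT: kbuf contents are not valid */
		return NULL;
	return kbuf;		/* success: NUL-terminated copy of src */
}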
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 969f48742d72..33cfbd4ed76d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b66b6d235d91..6e87ae2a1a66 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_event_file *file = data->private_data;
if (file)
- __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
else
trace_dump_stack(STACK_SKIP);
}
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 5d7ca80173ea..b40fa59159ac 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -919,9 +919,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo
struct __find_tracepoint_cb_data *data = priv;
if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
- data->tpoint = tp;
- if (!data->mod)
+ /* If the module is not specified, try to get the module refcount. */
+ if (!data->mod && mod) {
+ /* If we fail to get the refcount, ignore this tracepoint. */
+ if (!try_module_get(mod))
+ return;
+
data->mod = mod;
+ }
+ data->tpoint = tp;
}
}
@@ -933,7 +939,11 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
data->tpoint = tp;
}
-/* Find a tracepoint from kernel and module. */
+/*
+ * Find a tracepoint from the kernel and modules. If the tracepoint is in a
+ * module, the module's refcount is incremented and the module is returned via
+ * *@tp_mod. Thus, if *@tp_mod is not NULL, the caller must call
+ * module_put(*tp_mod) after using the tracepoint.
+ */
static struct tracepoint *find_tracepoint(const char *tp_name,
struct module **tp_mod)
{
@@ -962,7 +972,10 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf)
}
}
-/* Find a tracepoint from specified module. */
+/*
+ * Find a tracepoint from the specified module. This does not take the module's
+ * refcount; the caller must ensure the module is not freed.
+ */
static struct tracepoint *find_tracepoint_in_module(struct module *mod,
const char *tp_name)
{
@@ -1169,11 +1182,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
if (is_tracepoint) {
ctx->flags |= TPARG_FL_TPOINT;
tpoint = find_tracepoint(symbol, &tp_mod);
- /* lock module until register this tprobe. */
- if (tp_mod && !try_module_get(tp_mod)) {
- tpoint = NULL;
- tp_mod = NULL;
- }
if (tpoint) {
ctx->funcname = kallsyms_lookup(
(unsigned long)tpoint->probestub,
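The find_tracepoint() contract documented above implies the following caller shape; the surrounding function is hypothetical, but the tpoint/tp_mod handling mirrors how trace_fprobe_create_internal() uses it after this change:

/* Hypothetical caller following the contract documented above. */
static int attach_to_tracepoint(const char *tp_name)
{
	struct module *tp_mod = NULL;
	struct tracepoint *tpoint;
	int ret = -ENOENT;

	tpoint = find_tracepoint(tp_name, &tp_mod);
	if (tpoint)
		ret = 0;	/* ... use tpoint while tp_mod is pinned ... */

	/* Drop the refcount find_tracepoint() took for a module tracepoint. */
	if (tp_mod)
		module_put(tp_mod);
	return ret;
}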
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 98ccf3f00c51..4e37a0f6aaa3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -633,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+ __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2f077d4158e5..0c357a89c58e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -880,8 +880,6 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
if (print_retval || print_retaddr)
trace_seq_puts(s, " /*");
- else
- trace_seq_putc(s, '\n');
} else {
print_retaddr = false;
trace_seq_printf(s, "} /* %ps", func);
@@ -899,7 +897,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
}
if (!entry || print_retval || print_retaddr)
- trace_seq_puts(s, " */\n");
+ trace_seq_puts(s, " */");
}
#else
@@ -975,7 +973,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
} else
trace_seq_puts(s, "();");
}
- trace_seq_printf(s, "\n");
+ trace_seq_putc(s, '\n');
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -1313,10 +1311,11 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
* that if the funcgraph-tail option is enabled.
*/
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
- trace_seq_puts(s, "}\n");
+ trace_seq_puts(s, "}");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)func);
+ trace_seq_printf(s, "} /* %ps */", (void *)func);
}
+ trace_seq_putc(s, '\n');
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2703b96d8990..3e5c47b6d7b2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_kprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index fee40ffbd490..b9ab06c99543 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1042,11 +1042,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -1056,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2eeecb6c95ee..424751cdf31f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -154,9 +154,12 @@ fail:
}
static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;
void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.subsystem = subsystem;
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
@@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
void trace_probe_log_clear(void)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
memset(&trace_probe_log, 0, sizeof(trace_probe_log));
}
void trace_probe_log_set_index(int index)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.index = index;
}
@@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type)
char *command, *p;
int i, len = 0, pos = 0;
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
if (!trace_probe_log.argv)
return;
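Taken together with the dyn_event_create() wrapper added in trace_dynevent.c, these assertions encode one rule: trace_probe_log may only be touched while dyn_event_ops_mutex is held. A hedged sketch of how a probe type's create callback is expected to look under that rule; the my_probe names are hypothetical:

/* Illustrative create callback: dyn_event_create() has already taken
 * dyn_event_ops_mutex, so the trace_probe_log_* calls below satisfy the
 * lockdep_assert_held() checks added above. */
static int my_probe_create(const char *raw_command)
{
	char **argv;
	int argc, ret = 0;

	argv = argv_split(GFP_KERNEL, raw_command, &argc);
	if (!argv)
		return -ENOMEM;

	trace_probe_log_init("my_probe", argc, (const char **)argv);
	/* ... parse arguments, reporting positions via trace_probe_log_err() ... */
	trace_probe_log_clear();

	argv_free(argv);
	return ret;
}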
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3386439ec9f6..35cf76c75dd7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_uprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2ef2e1b80091..2f844c279a3e 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop);
* @arg: data to be passed to fn and handled_kill
* @name: the thread's name
*
- * This returns a specialized task for use by the vhost layer or NULL on
+ * This returns a specialized task for use by the vhost layer or ERR_PTR() on
* failure. The returned task is inactive, and the caller must fire it up
* through vhost_task_start().
*/
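With the corrected kerneldoc, callers should test the result with IS_ERR() rather than against NULL. A hedged sketch follows; the callback names, the dev pointer, and the exact argument order are assumptions based on the parameter list above, not taken from this patch:

/* Hypothetical caller; my_worker, my_handle_kill and dev are illustrative. */
struct vhost_task *vtsk;

vtsk = vhost_task_create(my_worker, my_handle_kill, dev, "vhost-worker");
if (IS_ERR(vtsk))
	return PTR_ERR(vtsk);	/* failure is ERR_PTR(), never NULL */

vhost_task_start(vtsk);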