summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c11
-rw-r--r--kernel/bpf/bpf_lru_list.c9
-rw-r--r--kernel/bpf/bpf_lru_list.h1
-rw-r--r--kernel/bpf/bpf_struct_ops.c100
-rw-r--r--kernel/bpf/btf.c4
-rw-r--r--kernel/bpf/cgroup.c33
-rw-r--r--kernel/bpf/core.c96
-rw-r--r--kernel/bpf/crypto.c2
-rw-r--r--kernel/bpf/hashtab.c8
-rw-r--r--kernel/bpf/helpers.c32
-rw-r--r--kernel/bpf/preload/Kconfig1
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c1
-rw-r--r--kernel/bpf/syscall.c32
-rw-r--r--kernel/bpf/verifier.c280
-rw-r--r--kernel/cgroup/cgroup.c74
-rw-r--r--kernel/cgroup/cpuset-internal.h1
-rw-r--r--kernel/cgroup/cpuset.c29
-rw-r--r--kernel/cgroup/legacy_freezer.c11
-rw-r--r--kernel/cgroup/rstat.c12
-rw-r--r--kernel/dma/contiguous.c5
-rw-r--r--kernel/dma/debug.c135
-rw-r--r--kernel/dma/debug.h20
-rw-r--r--kernel/dma/mapping.c66
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/events/core.c260
-rw-r--r--kernel/events/hw_breakpoint.c5
-rw-r--r--kernel/events/ring_buffer.c1
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/freezer.c15
-rw-r--r--kernel/irq/irq_sim.c2
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/kcsan/kcsan_test.c2
-rw-r--r--kernel/module/Kconfig1
-rw-r--r--kernel/module/main.c16
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/panic.c6
-rw-r--r--kernel/params.c10
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/power/energy_model.c51
-rw-r--r--kernel/power/hibernate.c5
-rw-r--r--kernel/power/main.c3
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/wakelock.c3
-rw-r--r--kernel/printk/nbcon.c63
-rw-r--r--kernel/printk/printk.c14
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/refscale.c10
-rw-r--r--kernel/rcu/tree.c31
-rw-r--r--kernel/rcu/tree.h16
-rw-r--r--kernel/rcu/tree_nocb.h7
-rw-r--r--kernel/rcu/tree_plugin.h66
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/resource.c5
-rw-r--r--kernel/rseq.c60
-rw-r--r--kernel/sched/core.c25
-rw-r--r--kernel/sched/cpufreq_schedutil.c46
-rw-r--r--kernel/sched/deadline.c14
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/ext.c45
-rw-r--r--kernel/sched/ext.h2
-rw-r--r--kernel/sched/fair.c146
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/psi.c123
-rw-r--r--kernel/sched/rt.c6
-rw-r--r--kernel/sched/sched.h7
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/softirq.c18
-rw-r--r--kernel/stop_machine.c20
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/hrtimer.c40
-rw-r--r--kernel/time/posix-cpu-timers.c9
-rw-r--r--kernel/time/posix-timers.c22
-rw-r--r--kernel/time/tick-common.c22
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/trace/bpf_trace.c14
-rw-r--r--kernel/trace/fgraph.c2
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/preemptirq_delay_test.c13
-rw-r--r--kernel/trace/ring_buffer.c112
-rw-r--r--kernel/trace/trace.c78
-rw-r--r--kernel/trace/trace.h34
-rw-r--r--kernel/trace/trace_dynevent.c16
-rw-r--r--kernel/trace/trace_dynevent.h1
-rw-r--r--kernel/trace/trace_events.c12
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_events_hist.c122
-rw-r--r--kernel/trace/trace_events_trigger.c22
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_kdb.c8
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_osnoise.c2
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_probe.c11
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/ucount.c2
-rw-r--r--kernel/vhost_task.c2
-rw-r--r--kernel/watchdog.c41
-rw-r--r--kernel/workqueue.c3
104 files changed, 1832 insertions, 965 deletions
diff --git a/kernel/audit.h b/kernel/audit.h
index a60d2840559e..5156ecd35457 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -199,7 +199,7 @@ struct audit_context {
int argc;
} execve;
struct {
- char *name;
+ const char *name;
} module;
struct {
struct audit_ntp_data ntp_data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cd57053b4a69..dae80e4dfcce 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2870,7 +2870,7 @@ void __audit_openat2_how(struct open_how *how)
context->type = AUDIT_OPENAT2;
}
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
{
struct audit_context *context = audit_context();
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 6547fb7ac0dc..129a51b1da1b 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -162,6 +162,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -170,21 +171,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- if (!bpf_cgrp_storage_trylock())
- return (unsigned long)NULL;
+ nobusy = bpf_cgrp_storage_trylock();
- sdata = cgroup_storage_lookup(cgroup, map, true);
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
if (sdata)
goto unlock;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, gfp_flags);
unlock:
- bpf_cgrp_storage_unlock();
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 3dabdd137d10..2d6e1c98d8ad 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list) {
__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
- if (++nfree == LOCAL_FREE_TARGET)
+ if (++nfree == lru->target_free)
break;
}
- if (nfree < LOCAL_FREE_TARGET)
- __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ if (nfree < lru->target_free)
+ __bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
@@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
+
+ lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+ 1, LOCAL_FREE_TARGET);
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index cbd8d3720c2b..fe2661a58ea9 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -58,6 +58,7 @@ struct bpf_lru {
del_from_htab_func del_from_htab;
void *del_arg;
unsigned int hash_offset;
+ unsigned int target_free;
unsigned int nr_scans;
bool percpu;
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index b70d0eef8a28..2285b27ce68c 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -147,39 +147,6 @@ void bpf_struct_ops_image_free(void *image)
}
#define MAYBE_NULL_SUFFIX "__nullable"
-#define MAX_STUB_NAME 128
-
-/* Return the type info of a stub function, if it exists.
- *
- * The name of a stub function is made up of the name of the struct_ops and
- * the name of the function pointer member, separated by "__". For example,
- * if the struct_ops type is named "foo_ops" and the function pointer
- * member is named "bar", the stub function name would be "foo_ops__bar".
- */
-static const struct btf_type *
-find_stub_func_proto(const struct btf *btf, const char *st_op_name,
- const char *member_name)
-{
- char stub_func_name[MAX_STUB_NAME];
- const struct btf_type *func_type;
- s32 btf_id;
- int cp;
-
- cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
- st_op_name, member_name);
- if (cp >= MAX_STUB_NAME) {
- pr_warn("Stub function name too long\n");
- return NULL;
- }
- btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
- if (btf_id < 0)
- return NULL;
- func_type = btf_type_by_id(btf, btf_id);
- if (!func_type)
- return NULL;
-
- return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
-}
/* Prepare argument info for every nullable argument of a member of a
* struct_ops type.
@@ -204,27 +171,42 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name,
static int prepare_arg_info(struct btf *btf,
const char *st_ops_name,
const char *member_name,
- const struct btf_type *func_proto,
+ const struct btf_type *func_proto, void *stub_func_addr,
struct bpf_struct_ops_arg_info *arg_info)
{
const struct btf_type *stub_func_proto, *pointed_type;
const struct btf_param *stub_args, *args;
struct bpf_ctx_arg_aux *info, *info_buf;
u32 nargs, arg_no, info_cnt = 0;
+ char ksym[KSYM_SYMBOL_LEN];
+ const char *stub_fname;
+ s32 stub_func_id;
u32 arg_btf_id;
int offset;
- stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
- if (!stub_func_proto)
- return 0;
+ stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym);
+ if (!stub_fname) {
+ pr_warn("Cannot find the stub function name for the %s in struct %s\n",
+ member_name, st_ops_name);
+ return -ENOENT;
+ }
+
+ stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC);
+ if (stub_func_id < 0) {
+ pr_warn("Cannot find the stub function %s in btf\n", stub_fname);
+ return -ENOENT;
+ }
+
+ stub_func_proto = btf_type_by_id(btf, stub_func_id);
+ stub_func_proto = btf_type_by_id(btf, stub_func_proto->type);
/* Check if the number of arguments of the stub function is the same
* as the number of arguments of the function pointer.
*/
nargs = btf_type_vlen(func_proto);
if (nargs != btf_type_vlen(stub_func_proto)) {
- pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
- st_ops_name, member_name, member_name, st_ops_name);
+ pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
+ stub_fname, member_name, st_ops_name);
return -EINVAL;
}
@@ -254,21 +236,21 @@ static int prepare_arg_info(struct btf *btf,
&arg_btf_id);
if (!pointed_type ||
!btf_type_is_struct(pointed_type)) {
- pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
- st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+ pr_warn("stub function %s has %s tagging to an unsupported type\n",
+ stub_fname, MAYBE_NULL_SUFFIX);
goto err_out;
}
offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
if (offset < 0) {
- pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
- st_ops_name, member_name, arg_no);
+ pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
+ stub_fname, arg_no);
goto err_out;
}
if (args[arg_no].type != stub_args[arg_no].type) {
- pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
- arg_no, st_ops_name, member_name);
+ pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
+ arg_no, stub_fname);
goto err_out;
}
@@ -325,6 +307,13 @@ static bool is_module_member(const struct btf *btf, u32 id)
return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
}
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
struct btf *btf,
struct bpf_verifier_log *log)
@@ -388,7 +377,10 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
for_each_member(i, t, member) {
const struct btf_type *func_proto;
+ void **stub_func_addr;
+ u32 moff;
+ moff = __btf_member_bit_offset(t, member) / 8;
mname = btf_name_by_offset(btf, member->name_off);
if (!*mname) {
pr_warn("anon member in struct %s is not supported\n",
@@ -414,7 +406,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
func_proto = btf_type_resolve_func_ptr(btf,
member->type,
NULL);
- if (!func_proto)
+
+ /* The member is not a function pointer or
+ * the function pointer is not supported.
+ */
+ if (!func_proto || bpf_struct_ops_supported(st_ops, moff))
continue;
if (btf_distill_func_proto(log, btf,
@@ -426,8 +422,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
goto errout;
}
+ stub_func_addr = *(void **)(st_ops->cfi_stubs + moff);
err = prepare_arg_info(btf, st_ops->name, mname,
- func_proto,
+ func_proto, stub_func_addr,
arg_info + i);
if (err)
goto errout;
@@ -580,7 +577,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
if (model->ret_size > 0)
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
- size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
if (size <= 0)
return size ? : -EFAULT;
@@ -1153,13 +1150,6 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
-int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
-{
- void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
-
- return func_ptr ? 0 : -ENOTSUPP;
-}
-
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2c54c148a94f..f83bd019db14 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6684,10 +6684,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* Is this a func with potential NULL args? */
if (strcmp(tname, raw_tp_null_args[i].func))
continue;
- if (raw_tp_null_args[i].mask & (0x1 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
info->reg_type |= PTR_MAYBE_NULL;
/* Is the current arg IS_ERR? */
- if (raw_tp_null_args[i].mask & (0x2 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
ptr_err_raw_tp = true;
break;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 025d7e2214ae..c0d606c40195 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
@@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
+ if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+ (*preorder_cnt)++;
cnt++;
}
return cnt;
@@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[atype]);
+ cnt = prog_list_length(&p->bpf.progs[atype], NULL);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
- int cnt = 0;
+ int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
/* count number of effective programs by walking parents */
do {
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[atype]);
+ cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
p = cgroup_parent(p);
} while (p);
@@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* populate the array with effective progs */
cnt = 0;
p = cgrp;
+ fstart = preorder_cnt;
+ bstart = preorder_cnt - 1;
do {
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
+ init_bstart = bstart;
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
- item = &progs->items[cnt];
+ if (pl->flags & BPF_F_PREORDER) {
+ item = &progs->items[bstart];
+ bstart--;
+ } else {
+ item = &progs->items[fstart];
+ fstart++;
+ }
item->prog = prog_list_prog(pl);
bpf_cgroup_storages_assign(item->cgroup_storage,
pl->storage);
cnt++;
}
+
+ /* reverse pre-ordering progs at this cgroup level */
+ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+ swap(progs->items[i], progs->items[j]);
+
} while ((p = cgroup_parent(p)));
*array = progs;
@@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
*/
return -EPERM;
- if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
+ pl->flags = flags;
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
@@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
lockdep_is_held(&cgroup_mutex));
total_cnt += bpf_prog_array_length(effective);
} else {
- total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
}
}
@@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
u32 id;
progs = &cgrp->bpf.progs[atype];
- cnt = min_t(int, prog_list_length(progs), total_cnt);
+ cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
i = 0;
hlist_for_each_entry(pl, progs, node) {
prog = prog_list_prog(pl);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a60a6a2ce0d7..9380e0fd5e4a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -778,7 +778,10 @@ bool is_bpf_text_address(unsigned long addr)
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
- struct bpf_ksym *ksym = bpf_ksym_find(addr);
+ struct bpf_ksym *ksym;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ ksym = bpf_ksym_find(addr);
return ksym && ksym->prog ?
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
@@ -2296,47 +2299,54 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, or interpreter is being used when
- * prog->jit_requested is not 0, so warn about it!
+ * is not working properly, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
}
-bool bpf_prog_map_compatible(struct bpf_map *map,
- const struct bpf_prog *fp)
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
- bool ret;
struct bpf_prog_aux *aux = fp->aux;
+ enum bpf_cgroup_storage_type i;
+ bool ret = false;
+ u64 cookie;
if (fp->kprobe_override)
- return false;
-
- /* XDP programs inserted into maps are not guaranteed to run on
- * a particular netdev (and can run outside driver context entirely
- * in the case of devmap and cpumap). Until device checks
- * are implemented, prohibit adding dev-bound programs to program maps.
- */
- if (bpf_prog_is_dev_bound(aux))
- return false;
+ return ret;
- spin_lock(&map->owner.lock);
- if (!map->owner.type) {
- /* There's no owner yet where we could check for
- * compatibility.
- */
- map->owner.type = prog_type;
- map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = aux->xdp_has_frags;
- map->owner.attach_func_proto = aux->attach_func_proto;
+ spin_lock(&map->owner_lock);
+ /* There's no owner yet where we could check for compatibility. */
+ if (!map->owner) {
+ map->owner = bpf_map_owner_alloc(map);
+ if (!map->owner)
+ goto err;
+ map->owner->type = prog_type;
+ map->owner->jited = fp->jited;
+ map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->attach_func_proto = aux->attach_func_proto;
+ for_each_cgroup_storage_type(i) {
+ map->owner->storage_cookie[i] =
+ aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ }
ret = true;
} else {
- ret = map->owner.type == prog_type &&
- map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == aux->xdp_has_frags;
+ ret = map->owner->type == prog_type &&
+ map->owner->jited == fp->jited &&
+ map->owner->xdp_has_frags == aux->xdp_has_frags;
+ for_each_cgroup_storage_type(i) {
+ if (!ret)
+ break;
+ cookie = aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ ret = map->owner->storage_cookie[i] == cookie ||
+ !cookie;
+ }
if (ret &&
- map->owner.attach_func_proto != aux->attach_func_proto) {
+ map->owner->attach_func_proto != aux->attach_func_proto) {
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
@@ -2349,11 +2359,24 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
}
}
}
- spin_unlock(&map->owner.lock);
-
+err:
+ spin_unlock(&map->owner_lock);
return ret;
}
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ return __bpf_prog_map_compatible(map, fp);
+}
+
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2366,7 +2389,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
if (!map_type_contains_progs(map))
continue;
- if (!bpf_prog_map_compatible(map, fp)) {
+ if (!__bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -2377,8 +2400,9 @@ out:
return ret;
}
-static void bpf_prog_select_func(struct bpf_prog *fp)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
@@ -2387,15 +2411,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
* But for non-JITed programs, we don't need bpf_func, so no bounds
* check needed.
*/
- if (!fp->jit_requested &&
- !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ if (idx < ARRAY_SIZE(interpreters)) {
fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
} else {
fp->bpf_func = __bpf_prog_ret0_warn;
}
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
+ return select_interpreter;
}
/**
@@ -2423,7 +2448,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
bpf_prog_has_kfunc_call(fp))
jit_needed = true;
- bpf_prog_select_func(fp);
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 94854cd9c4cc..83c4d9943084 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
- if (!src_len || !dst_len)
+ if (!src_len || !dst_len || src_len > dst_len)
return -EINVAL;
if (siv_len != ctx->siv_len)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ec941a0ea41..570e2f723144 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -198,12 +198,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + key_size) = pptr;
+ *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + key_size);
+ return *(void __percpu **)(l->key + roundup(key_size, 8));
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
@@ -2223,7 +2223,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
- hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key;
if (is_percpu) {
/* current cpu value for percpu map */
@@ -2355,7 +2355,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
- offsetof(struct htab_elem, key) + map->key_size);
+ offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a05aeb345896..a0bf39b7359a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -129,7 +129,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -882,6 +883,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
if (fmt[i] == 'p') {
sizeof_cur_arg = sizeof(long);
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1])) {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ goto nocopy_fmt;
+ }
+
if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
fmt[i + 2] == 's') {
fmt_ptype = fmt[i + 1];
@@ -889,11 +897,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
goto fmt_str;
}
- if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
- ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ if (fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
fmt[i + 1] == 'S') {
- /* just kernel pointers */
if (tmp_buf)
cur_arg = raw_args[num_spec];
i++;
@@ -1270,8 +1276,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
+ * kmalloc_nolock() is available, avoid locking issues by using
+ * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
+ */
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -2775,9 +2784,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
struct bpf_throw_ctx *ctx = cookie;
struct bpf_prog *prog;
- if (!is_bpf_text_address(ip))
- return !ctx->cnt;
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return !ctx->cnt;
ctx->cnt++;
if (bpf_is_subprog(prog))
return true;
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index c9d45c9d6918..f9b11d01c3b5 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -10,7 +10,6 @@ menuconfig BPF_PRELOAD
# The dependency on !COMPILE_TEST prevents it from being enabled
# in allmodconfig or allyesconfig configurations
depends on !COMPILE_TEST
- select USERMODE_DRIVER
help
This builds kernel module with several embedded BPF programs that are
pinned into BPF FS mount point as human readable files that are
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 0c63bc2cd895..fdad0eb308fe 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -89,4 +89,5 @@ static void __exit fini(void)
}
late_initcall(load);
module_exit(fini);
+MODULE_IMPORT_NS(BPF_INTERNAL);
MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 696e5a2cbea2..ba4543e771a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/cookie.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -51,6 +52,7 @@
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
@@ -765,6 +767,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
security_bpf_map_free(map);
bpf_map_release_memcg(map);
+ bpf_map_owner_free(map);
bpf_map_free(map);
}
@@ -859,12 +862,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map_type_contains_progs(map)) {
- spin_lock(&map->owner.lock);
- type = map->owner.type;
- jited = map->owner.jited;
- spin_unlock(&map->owner.lock);
+ spin_lock(&map->owner_lock);
+ if (map->owner) {
+ type = map->owner->type;
+ jited = map->owner->jited;
}
+ spin_unlock(&map->owner_lock);
seq_printf(m,
"map_type:\t%u\n"
@@ -1360,10 +1363,14 @@ static int map_create(union bpf_attr *attr)
if (err < 0)
goto free_map;
+ preempt_disable();
+ map->cookie = gen_cookie_next(&bpf_map_cookie);
+ preempt_enable();
+
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- spin_lock_init(&map->owner.lock);
+ spin_lock_init(&map->owner_lock);
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
@@ -1457,7 +1464,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
-EXPORT_SYMBOL(bpf_map_get);
+EXPORT_SYMBOL_NS(bpf_map_get, BPF_INTERNAL);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -3223,7 +3230,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
bpf_link_inc(link);
return link;
}
-EXPORT_SYMBOL(bpf_link_get_from_fd);
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, BPF_INTERNAL);
static void bpf_tracing_link_release(struct bpf_link *link)
{
@@ -4042,7 +4049,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_F_ATTACH_MASK_BASE \
(BPF_F_ALLOW_OVERRIDE | \
BPF_F_ALLOW_MULTI | \
- BPF_F_REPLACE)
+ BPF_F_REPLACE | \
+ BPF_F_PREORDER)
#define BPF_F_ATTACH_MASK_MPROG \
(BPF_F_REPLACE | \
@@ -4606,6 +4614,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -4752,8 +4762,6 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
}
- if (prog->aux->btf)
- info.btf_id = btf_obj_id(prog->aux->btf);
info.attach_btf_id = prog->aux->attach_btf_id;
if (attach_btf)
info.attach_btf_obj_id = btf_obj_id(attach_btf);
@@ -5853,7 +5861,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
return ____bpf_sys_bpf(cmd, attr, size);
}
}
-EXPORT_SYMBOL(kern_sys_bpf);
+EXPORT_SYMBOL_NS(kern_sys_bpf, BPF_INTERNAL);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9000806ee3ba..24ae8f33e5d7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -400,7 +400,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON ||
(type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
- type == PTR_TO_MEM;
+ type == PTR_TO_MEM ||
+ type == CONST_PTR_TO_MAP;
}
static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
@@ -1376,13 +1377,6 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
-{
- kfree(state->jmp_history);
- state->jmp_history = NULL;
- state->jmp_history_cnt = 0;
-}
-
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -1392,7 +1386,6 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
- clear_jmp_history(state);
if (free_self)
kfree(state);
}
@@ -1418,13 +1411,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
struct bpf_func_state *dst;
int i, err;
- dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
- GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- dst_state->jmp_history_cnt = src->jmp_history_cnt;
-
/* if dst has more stack frames then src frame, free them, this is also
* necessary in case of exceptional exits using bpf_throw.
*/
@@ -1443,10 +1429,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->insn_hist_start = src->insn_hist_start;
+ dst_state->insn_hist_end = src->insn_hist_end;
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
dst_state->may_goto_depth = src->may_goto_depth;
+ dst_state->loop_entry = src->loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -2495,9 +2484,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
+ * But we do need to make sure to not clobber insn_hist, so we keep
+ * chaining insn_hist_start/insn_hist_end indices as for a normal
+ * child state.
*/
elem->st.branches = 1;
elem->st.in_sleepable = is_sleepable;
+ elem->st.insn_hist_start = env->cur_state->insn_hist_end;
+ elem->st.insn_hist_end = elem->st.insn_hist_start;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@@ -2528,16 +2522,36 @@ static int cmp_subprogs(const void *a, const void *b)
((struct bpf_subprog_info *)b)->start;
}
+/* Find subprogram that contains instruction at 'off' */
+static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_info, env->subprog_cnt,
- sizeof(env->subprog_info[0]), cmp_subprogs);
- if (!p)
+ p = find_containing_subprog(env, off);
+ if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
-
}
static int add_subprog(struct bpf_verifier_env *env, int off)
@@ -2967,6 +2981,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
return res ? &res->func_model : NULL;
}
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
@@ -3477,11 +3506,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
}
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
+static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
+ struct bpf_insn_hist_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */
@@ -3501,29 +3529,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
return 0;
}
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
+ if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
+ alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
+ p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ env->insn_hist = p;
+ env->insn_hist_cap = alloc_size / sizeof(*p);
+ }
- p = &cur->jmp_history[cnt - 1];
+ p = &env->insn_hist[cur->insn_hist_end];
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
p->linked_regs = linked_regs;
- cur->jmp_history_cnt = cnt;
+
+ cur->insn_hist_end++;
env->cur_hist_ent = p;
return 0;
}
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
+static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
+ u32 hist_start, u32 hist_end, int insn_idx)
{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
+ if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
+ return &env->insn_hist[hist_end - 1];
return NULL;
}
@@ -3540,25 +3571,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat
* history entry recording a jump from last instruction of parent state and
* first instruction of given state.
*/
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
+static int get_prev_insn_idx(const struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ int insn_idx, u32 hist_start, u32 *hist_endp)
{
- u32 cnt = *history;
+ u32 hist_end = *hist_endp;
+ u32 cnt = hist_end - hist_start;
- if (i == st->first_insn_idx) {
+ if (insn_idx == st->first_insn_idx) {
if (cnt == 0)
return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
+ if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
return -ENOENT;
}
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
+ if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
+ (*hist_endp)--;
+ return env->insn_hist[hist_end - 1].prev_idx;
} else {
- i--;
+ return insn_idx - 1;
}
- return i;
}
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3730,7 +3762,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
/* If any register R in hist->linked_regs is marked as precise in bt,
* do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
{
struct linked_regs linked_regs;
bool some_precise = false;
@@ -3775,7 +3807,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+ struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -4035,8 +4067,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- bt_set_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
} else if (BPF_SRC(insn->code) == BPF_K) {
/* dreg <cond> K
* Only dreg still needs precision before
@@ -4194,7 +4228,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* SCALARS, as well as any other registers and slots that contribute to
* a tracked state of given registers/stack slots, depending on specific BPF
* assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
+ * logic). This backtracking relies on recorded insn_hist and is able to
* traverse entire chain of parent states. This process ends only when all the
* necessary registers/slots and their transitive dependencies are marked as
* precise.
@@ -4311,8 +4345,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
+ u32 hist_start = st->insn_hist_start;
+ u32 hist_end = st->insn_hist_end;
+ struct bpf_insn_hist_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4351,7 +4386,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- hist = get_jmp_hist_entry(st, history, i);
+ hist = get_insn_hist_entry(env, hist_start, hist_end, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
@@ -4368,7 +4403,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
*/
return 0;
subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
+ i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
if (i == -ENOENT)
break;
if (i >= env->prog->len) {
@@ -4735,7 +4770,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5042,7 +5077,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -9811,6 +9846,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
subprog, sub_name);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
/* mark global subprog for verifying after main prog */
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
@@ -15016,6 +15053,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
if (!is_reg_const(reg2, is_jmp32))
break;
val = reg_const_value(reg2, is_jmp32);
+ /* Forget the ranges before narrowing tnums, to avoid invariant
+ * violations if we're on a dead branch.
+ */
+ __mark_reg_unbounded(reg1);
if (is_jmp32) {
t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
reg1->var_off = tnum_with_subreg(reg1->var_off, t);
@@ -15381,6 +15422,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *eq_branch_regs;
struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
bool is_jmp32;
int pred = -1;
int err;
@@ -15440,6 +15482,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -15449,6 +15496,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
__mark_reg_known(src_reg, insn->imm);
+
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
+ }
+
+ if (insn_flags) {
+ err = push_insn_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
@@ -15504,7 +15560,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -16001,6 +16057,29 @@ enforce_retval:
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = find_containing_subprog(env, t);
+ callee = find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -16134,6 +16213,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
bool visit_callee)
{
int ret, insn_sz;
+ int w;
insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
@@ -16145,8 +16225,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
mark_jmp_point(env, t + insn_sz);
if (visit_callee) {
+ w = t + insns[t].imm + 1;
mark_prune_point(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
}
return ret;
}
@@ -16466,6 +16548,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_prune_point(env, t);
mark_jmp_point(env, t);
}
+ if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -16600,6 +16684,7 @@ walk_cfg:
}
}
ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
err_free:
kvfree(insn_state);
@@ -17216,12 +17301,16 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
+ struct bpf_verifier_state *loop_entry;
struct bpf_verifier_state_list *sl;
sl = *explored_state(env, insn);
while (sl) {
if (sl->state.branches)
goto next;
+ loop_entry = get_loop_entry(&sl->state);
+ if (loop_entry && loop_entry->branches)
+ goto next;
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
goto next;
@@ -17913,7 +18002,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
/* Avoid accumulating infinitely long jmp history */
- cur->jmp_history_cnt > 40;
+ cur->insn_hist_end - cur->insn_hist_start > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -18111,7 +18200,7 @@ hit:
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0, 0);
+ err = err ? : push_insn_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -18210,8 +18299,8 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
@@ -18379,7 +18468,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0, 0);
+ err = push_insn_history(env, state, 0, 0);
if (err)
return err;
}
@@ -18650,6 +18739,10 @@ process_bpf_exit:
return err;
break;
} else {
+ if (WARN_ON_ONCE(env->cur_state->loop_entry)) {
+ verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n");
+ return -EFAULT;
+ }
do_print_state = true;
continue;
}
@@ -19717,7 +19810,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn *epilogue_buf = env->epilogue_buf;
struct bpf_insn *insn_buf = env->insn_buf;
@@ -19746,6 +19839,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -ENOMEM;
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
@@ -19766,6 +19863,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
@@ -20102,6 +20203,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
@@ -21938,6 +22040,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
if (bpf_prog_is_dev_bound(prog->aux) &&
!bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -21972,6 +22075,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension programs should be JITed\n");
return -EINVAL;
}
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
@@ -22228,6 +22339,33 @@ BTF_ID(func, __rcu_read_unlock)
#endif
BTF_SET_END(btf_id_deny)
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
static bool can_be_sleepable(struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING) {
@@ -22316,6 +22454,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
@@ -22437,10 +22580,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
@@ -22455,6 +22594,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = mark_fastcall_patterns(env);
if (ret < 0)
goto skip_full_check;
@@ -22591,6 +22734,7 @@ err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
+ kvfree(env->insn_hist);
err_free_env:
kvfree(env);
return ret;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4378f3eff25d..5fc2801ee921 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -90,7 +90,7 @@
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
@@ -123,8 +123,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -2339,9 +2362,37 @@ static struct file_system_type cgroup2_fs_type = {
};
#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+ Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ return 0;
+ }
+ return -EINVAL;
+}
+
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
+ .parse_param = cpuset_parse_param,
};
/*
@@ -2378,6 +2429,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
+ .parameters = cpuset_fs_parameters,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
@@ -5512,7 +5564,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5521,7 +5573,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5656,7 +5708,7 @@ err_list_del:
err_free_css:
list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5889,7 +5941,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6267,8 +6319,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 976a8bc3ff60..383963e28ac6 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -33,6 +33,7 @@ enum prs_errcode {
PERR_CPUSEMPTY,
PERR_HKEEPING,
PERR_ACCESS,
+ PERR_REMOTE,
};
/* bits in struct cpuset flags field */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 839f88ba17f7..25f9565f798d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -62,6 +62,7 @@ static const char * const perr_strings[] = {
[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
[PERR_ACCESS] = "Enable partition not permitted",
+ [PERR_REMOTE] = "Have remote partition underneath",
};
/*
@@ -266,7 +267,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
- static_branch_enable(&cpusets_insane_config_key);
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
@@ -1099,9 +1100,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
- * Percpu kthreads in top_cpuset are ignored
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
*/
- if (kthread_is_per_cpu(task))
+ if (task->flags & PF_NO_SETAFFINITY)
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
@@ -1768,7 +1771,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- } else if (is_partition_invalid(cs) &&
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
struct cpuset *child;
@@ -2825,6 +2828,19 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
/*
+ * We don't support the creation of a new local partition with
+ * a remote partition underneath it. This unsupported
+ * setting can happen only if parent is the top_cpuset because
+ * a remote partition cannot be created underneath an existing
+ * local or remote partition.
+ */
+ if ((parent == &top_cpuset) &&
+ cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+ err = PERR_REMOTE;
+ goto out;
+ }
+
+ /*
* If parent is valid partition, enable local partiion.
* Otherwise, enable a remote partition.
*/
@@ -3776,9 +3792,10 @@ retry:
partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
- * back to a regular one.
+ * back to a regular one with a non-empty effective xcpus.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
partcmd = partcmd_update;
if (partcmd >= 0) {
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 074653f964c1..c37888b7d25a 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
- unsigned int state;
rcu_read_lock();
- /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
- * !FROZEN check is required, because the FREEZING bit is not cleared
- * when the state FROZEN is reached.
- */
- state = task_freezer(task)->state;
- ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
@@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
-
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 3e01781aeb7b..c4ce2f5a9745 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -323,13 +323,11 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
rcu_read_unlock();
}
- /* play nice and yield if necessary */
- if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- __cgroup_rstat_unlock(cgrp, cpu);
- if (!cond_resched())
- cpu_relax();
- __cgroup_rstat_lock(cgrp, cpu);
- }
+ /* play nice and avoid disabling interrupts for a long time */
+ __cgroup_rstat_unlock(cgrp, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ __cgroup_rstat_lock(cgrp, cpu);
}
}
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 055da410ac71..9e5d63efe7c5 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes __initconst =
- (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
static phys_addr_t size_cmdline __initdata = -1;
static phys_addr_t base_cmdline __initdata;
static phys_addr_t limit_cmdline __initdata;
@@ -481,8 +480,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: unable to setup CMA region\n");
return err;
}
- /* Architecture specific contiguous memory fixup. */
- dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
dma_contiguous_default_area = cma;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f6f0387761d0..39972e834e7a 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -39,6 +39,7 @@ enum {
dma_debug_sg,
dma_debug_coherent,
dma_debug_resource,
+ dma_debug_noncoherent,
};
enum map_err_types {
@@ -59,8 +60,7 @@ enum map_err_types {
* @direction: enum dma_data_direction
* @sg_call_ents: 'nents' from dma_map_sg
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
- * @pfn: page frame of the start address
- * @offset: offset of mapping relative to pfn
+ * @paddr: physical start address of the mapping
* @map_err_type: track whether dma_mapping_error() was checked
* @stack_len: number of backtrace entries in @stack_entries
* @stack_entries: stack of backtrace history
@@ -74,8 +74,7 @@ struct dma_debug_entry {
int direction;
int sg_call_ents;
int sg_mapped_ents;
- unsigned long pfn;
- size_t offset;
+ phys_addr_t paddr;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
@@ -143,6 +142,7 @@ static const char *type2name[] = {
[dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
[dma_debug_resource] = "resource",
+ [dma_debug_noncoherent] = "noncoherent",
};
static const char *dir2name[] = {
@@ -389,14 +389,6 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
list_del(&entry->list);
}
-static unsigned long long phys_addr(struct dma_debug_entry *entry)
-{
- if (entry->type == dma_debug_resource)
- return __pfn_to_phys(entry->pfn) + entry->offset;
-
- return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
-}
-
/*
* For each mapping (initial cacheline in the case of
* dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
@@ -428,8 +420,8 @@ static DEFINE_SPINLOCK(radix_lock);
static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
{
- return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) +
- (entry->offset >> L1_CACHE_SHIFT);
+ return ((entry->paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) +
+ (offset_in_page(entry->paddr) >> L1_CACHE_SHIFT);
}
static int active_cacheline_read_overlap(phys_addr_t cln)
@@ -538,11 +530,11 @@ void debug_dma_dump_mappings(struct device *dev)
if (!dev || dev == entry->dev) {
cln = to_cacheline_number(entry);
dev_info(entry->dev,
- "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ "%s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- &cln, dir2name[entry->direction],
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
maperr2str[entry->map_err_type]);
}
}
@@ -569,13 +561,13 @@ static int dump_show(struct seq_file *seq, void *v)
list_for_each_entry(entry, &bucket->list, list) {
cln = to_cacheline_number(entry);
seq_printf(seq,
- "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ "%s %s %s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
dev_driver_string(entry->dev),
dev_name(entry->dev),
type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- &cln, dir2name[entry->direction],
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
maperr2str[entry->map_err_type]);
}
spin_unlock_irqrestore(&bucket->lock, flags);
@@ -1003,16 +995,17 @@ static void check_unmap(struct dma_debug_entry *ref)
"[mapped as %s] [unmapped as %s]\n",
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
- } else if ((entry->type == dma_debug_coherent) &&
- (phys_addr(ref) != phys_addr(entry))) {
+ } else if ((entry->type == dma_debug_coherent ||
+ entry->type == dma_debug_noncoherent) &&
+ ref->paddr != entry->paddr) {
err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
- "[cpu alloc address=0x%016llx] "
- "[cpu free address=0x%016llx]",
+ "[cpu alloc address=0x%pa] "
+ "[cpu free address=0x%pa]",
ref->dev_addr, ref->size,
- phys_addr(entry),
- phys_addr(ref));
+ &entry->paddr,
+ &ref->paddr);
}
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
@@ -1231,8 +1224,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->dev = dev;
entry->type = dma_debug_single;
- entry->pfn = page_to_pfn(page);
- entry->offset = offset;
+ entry->paddr = page_to_phys(page) + offset;
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
@@ -1327,8 +1319,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
- entry->pfn = page_to_pfn(sg_page(s));
- entry->offset = s->offset;
+ entry->paddr = sg_phys(s);
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
@@ -1374,8 +1365,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = dir,
@@ -1392,6 +1382,18 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
}
+static phys_addr_t virt_to_paddr(void *virt)
+{
+ struct page *page;
+
+ if (is_vmalloc_addr(virt))
+ page = vmalloc_to_page(virt);
+ else
+ page = virt_to_page(virt);
+
+ return page_to_phys(page) + offset_in_page(virt);
+}
+
void debug_dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t dma_addr, void *virt,
unsigned long attrs)
@@ -1414,16 +1416,11 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
entry->type = dma_debug_coherent;
entry->dev = dev;
- entry->offset = offset_in_page(virt);
+ entry->paddr = virt_to_paddr(virt);
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = DMA_BIDIRECTIONAL;
- if (is_vmalloc_addr(virt))
- entry->pfn = vmalloc_to_pfn(virt);
- else
- entry->pfn = page_to_pfn(virt_to_page(virt));
-
add_dma_entry(entry, attrs);
}
@@ -1433,7 +1430,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
- .offset = offset_in_page(virt),
.dev_addr = dma_addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
@@ -1443,10 +1439,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
return;
- if (is_vmalloc_addr(virt))
- ref.pfn = vmalloc_to_pfn(virt);
- else
- ref.pfn = page_to_pfn(virt_to_page(virt));
+ ref.paddr = virt_to_paddr(virt);
if (unlikely(dma_debug_disabled()))
return;
@@ -1469,8 +1462,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->type = dma_debug_resource;
entry->dev = dev;
- entry->pfn = PHYS_PFN(addr);
- entry->offset = offset_in_page(addr);
+ entry->paddr = addr;
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = direction;
@@ -1547,8 +1539,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1579,8 +1570,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(sg),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1596,6 +1586,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
}
}
+void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_noncoherent;
+ entry->dev = dev;
+ entry->paddr = page_to_phys(page);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_noncoherent,
+ .dev = dev,
+ .paddr = page_to_phys(page),
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
static int __init dma_debug_driver_setup(char *str)
{
int i;
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index f525197d3cae..48757ca13f31 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev,
extern void debug_dma_sync_sg_for_device(struct device *dev,
struct scatterlist *sg,
int nelems, int direction);
+extern void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+extern void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
@@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev,
int nelems, int direction)
{
}
+
+static inline void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
#endif /* CONFIG_DMA_API_DEBUG */
#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 864a1121bf08..c7cc4e33ec6e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -223,6 +223,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
} else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
ents != -EIO && ents != -EREMOTEIO)) {
+ trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs);
return -EIO;
}
@@ -604,22 +605,29 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (WARN_ON_ONCE(flag & __GFP_COMP))
return NULL;
- if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) {
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size,
+ DMA_BIDIRECTIONAL, flag, attrs);
return cpu_addr;
+ }
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_alloc_direct(dev, ops))
+ if (dma_alloc_direct(dev, ops)) {
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
- else if (use_dma_iommu(dev))
+ } else if (use_dma_iommu(dev)) {
cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
- else if (ops->alloc)
+ } else if (ops->alloc) {
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
- else
+ } else {
+ trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag,
+ attrs);
return NULL;
+ }
- trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs);
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL,
+ flag, attrs);
debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
@@ -641,10 +649,11 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
*/
WARN_ON(irqs_disabled());
+ trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL,
+ attrs);
if (!cpu_addr)
return;
- trace_dma_free(dev, cpu_addr, dma_handle, size, attrs);
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
@@ -683,9 +692,11 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page) {
- trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size,
- dir, 0);
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
+ size, dir, gfp, 0);
+ debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0);
+ } else {
+ trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
}
return page;
}
@@ -708,8 +719,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
- trace_dma_unmap_page(dev, dma_handle, size, dir, 0);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
+ trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
+ debug_dma_free_pages(dev, page, size, dir, dma_handle);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_pages);
@@ -768,8 +779,10 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs);
debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ } else {
+ trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs);
}
return sgt;
}
@@ -787,7 +800,7 @@ static void free_single_sgt(struct device *dev, size_t size,
void dma_free_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt, enum dma_data_direction dir)
{
- trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0);
+ trace_dma_free_sgt(dev, sgt, size, dir);
debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
if (use_dma_iommu(dev))
@@ -897,6 +910,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_coherent_mask);
+static bool __dma_addressing_limited(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
+
+ if (unlikely(ops) || use_dma_iommu(dev))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
+}
+
/**
* dma_addressing_limited - return if the device is addressing limited
* @dev: device to check
@@ -907,15 +933,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
*/
bool dma_addressing_limited(struct device *dev)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
- dma_get_required_mask(dev))
- return true;
-
- if (unlikely(ops) || use_dma_iommu(dev))
+ if (!__dma_addressing_limited(dev))
return false;
- return !dma_direct_all_ram_mapped(dev);
+
+ dev_dbg(dev, "device is DMA addressing limited\n");
+ return true;
}
EXPORT_SYMBOL_GPL(dma_addressing_limited);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 7b04f7575796..ee45dee33d49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
if (!addr)
goto free_page;
#else
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 97af53c43608..3cc06ffb60c1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -206,6 +206,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
__perf_ctx_unlock(&cpuctx->ctx);
}
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
@@ -892,13 +905,19 @@ static void perf_cgroup_switch(struct task_struct *task)
if (READ_ONCE(cpuctx->cgrp) == NULL)
return;
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
cgrp = perf_cgroup_from_task(task, NULL);
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+ /*
+ * Re-check, could've raced vs perf_remove_from_context().
+ */
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
+
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -916,7 +935,6 @@ static void perf_cgroup_switch(struct task_struct *task)
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -1195,6 +1213,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu)
WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
+static inline void perf_pmu_read(struct perf_event *event)
+{
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+}
+
static void get_ctx(struct perf_event_context *ctx)
{
refcount_inc(&ctx->refcount);
@@ -2105,8 +2129,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
- struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
@@ -2139,8 +2164,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
@@ -2198,18 +2222,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
&event->pmu_ctx->flexible_active;
}
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
- event_sched_out(event, event->ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
@@ -2245,8 +2257,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
- perf_remove_sibling_event(sibling);
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2506,6 +2525,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
+}
+
/*
* Cross CPU call to disable a performance event
*/
@@ -2520,13 +2548,18 @@ static void __perf_event_disable(struct perf_event *event,
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
+ /*
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
+ */
if (event == event->group_leader)
group_sched_out(event, ctx);
- else
- event_sched_out(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- perf_cgroup_event_disable(event, ctx);
+ /*
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
+ */
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -3482,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -4634,15 +4666,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -6033,6 +6058,9 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -6101,7 +6129,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = perf_event_set_bpf_prog(event, prog, 0);
+ err = __perf_event_set_bpf_prog(event, prog, 0);
if (err) {
bpf_prog_put(prog);
return err;
@@ -6589,11 +6617,21 @@ out_put:
ring_buffer_put(rb); /* could be last */
}
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
+ .may_split = perf_mmap_may_split,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -6685,9 +6723,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
goto unlock;
}
- atomic_set(&rb->aux_mmap_count, 1);
user_extra = nr_pages;
-
goto accounting;
}
@@ -6789,8 +6825,10 @@ accounting:
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
+ if (!ret) {
+ atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
+ }
}
unlock:
@@ -6800,6 +6838,7 @@ unlock:
atomic_inc(&event->mmap_count);
} else if (rb) {
+ /* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
@@ -6807,6 +6846,9 @@ aux_unlock:
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
+ if (ret)
+ return ret;
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7096,6 +7138,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -7408,9 +7454,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
@@ -7423,9 +7468,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
@@ -7484,6 +7528,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -7806,6 +7853,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
@@ -9716,14 +9766,14 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle &&
- hwc->interrupts > max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
+ ret = 1;
}
if (event->attr.freq) {
@@ -10701,7 +10751,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/*
@@ -10757,8 +10807,9 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
- u64 bpf_cookie)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
@@ -10796,6 +10847,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!perf_event_is_tracing(event)) {
@@ -10815,7 +10880,15 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -ENOENT;
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
u64 bpf_cookie)
{
return -ENOENT;
@@ -11978,40 +12051,51 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
- if (!ret) {
- if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event))
- ret = -EOPNOTSUPP;
+ if (ret)
+ goto err_pmu;
- if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event))
- ret = -EINVAL;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
- if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
- const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
- struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
- int cpu;
-
- if (pmu_cpumask && cpumask) {
- cpu = cpumask_any_and(pmu_cpumask, cpumask);
- if (cpu >= nr_cpu_ids)
- ret = -ENODEV;
- else
- event->event_caps |= PERF_EV_CAP_READ_SCOPE;
- } else {
- ret = -ENODEV;
- }
- }
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
+ }
+
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
- if (ret && event->destroy)
- event->destroy(event);
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
}
- if (ret) {
- event->pmu = NULL;
- module_put(pmu->module);
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
}
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
@@ -13661,6 +13745,9 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
@@ -13683,8 +13770,6 @@ inherit_event(struct perf_event *parent_event,
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -13705,7 +13790,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6c2cb4e4f48d..8f3f624419aa 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index bbfa22c0a159..6ecbbc57cdfd 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
diff --git a/kernel/exit.c b/kernel/exit.c
index 619f0014c33b..d465b36bcc86 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -742,10 +742,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = EXIT_ZOMBIE;
/*
- * sub-thread or delay_group_leader(), wake up the
- * PIDFD_THREAD waiters.
+ * Ignore thread-group leaders that exited before all
+ * subthreads did.
*/
- if (!thread_group_empty(tsk))
+ if (!delay_group_leader(tsk))
do_notify_pidfd(tsk);
if (unlikely(tsk->ptrace)) {
@@ -923,6 +923,15 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+
exit_mm();
if (group_dead)
@@ -939,14 +948,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 12decadff468..97c9afe3efc3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -505,10 +505,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
- /* track_pfn_copy() will later take care of copying internal state. */
- if (unlikely(new->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(new);
-
return new;
}
@@ -699,6 +695,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp = vm_area_dup(mpnt);
if (!tmp)
goto fail_nomem;
+
+ /* track_pfn_copy() will later take care of copying internal state. */
+ if (unlikely(tmp->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(tmp);
+
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8d530d0949ff..6a96149aede9 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg)
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
-
- spin_lock_irqsave(&freezer_lock, flags);
- if (WARN_ON_ONCE(freezing(p)))
- goto unlock;
-
- if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
- goto unlock;
-
- wake_up_state(p, TASK_FROZEN);
-unlock:
- spin_unlock_irqrestore(&freezer_lock, flags);
+ guard(spinlock_irqsave)(&freezer_lock);
+ if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
+ wake_up_state(p, TASK_FROZEN);
}
/**
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 1a3d483548e2..ae4c9cbd1b4b 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
void *data)
{
struct irq_sim_work_ctx *work_ctx __free(kfree) =
- kmalloc(sizeof(*work_ctx), GFP_KERNEL);
+ kzalloc(sizeof(*work_ctx), GFP_KERNEL);
if (!work_ctx)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index a83fdc053c2b..6d62fc7c1dfa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1146,7 +1146,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
return false;
/*
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 117d9d4d3c3b..5121d9a6dd3b 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test)
struct kcsan_scoped_access *reorder_access = NULL;
#endif
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
- atomic_t dummy;
+ atomic_t dummy = ATOMIC_INIT(0);
KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 7c6588148d42..0c746a150e34 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -231,6 +231,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
choice
prompt "Hash algorithm to sign modules"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
help
This determines which sort of hashing algorithm will be used during
signature generation. This algorithm _must_ be built into the kernel
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 93a07387af3b..4511d0a4762a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -703,14 +703,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
struct module *mod;
char name[MODULE_NAME_LEN];
char buf[MODULE_FLAGS_BUF_SIZE];
- int ret, forced = 0;
+ int ret, len, forced = 0;
if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
- if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
- return -EFAULT;
- name[MODULE_NAME_LEN-1] = '\0';
+ len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
+ if (len == 0 || len == MODULE_NAME_LEN)
+ return -ENOENT;
+ if (len < 0)
+ return len;
audit_log_kern_module(name);
@@ -2898,7 +2900,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_allocated = true;
- audit_log_kern_module(mod->name);
+ audit_log_kern_module(info->name);
/* Reserve our place in the list. */
err = add_unformed_module(mod);
@@ -3058,8 +3060,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
* failures once the proper module was allocated and
* before that.
*/
- if (!module_allocated)
+ if (!module_allocated) {
+ audit_log_kern_module(info->name ? info->name : "?");
mod_stat_bump_becoming(info, flags);
+ }
free_copy(info, flags);
return err;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index 22770372bdf3..3e0ef0753e73 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd)
* To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
*/
padata_get_pd(pd);
- queue_work(pinst->serial_wq, &pd->reorder_work);
+ if (!queue_work(pinst->serial_wq, &pd->reorder_work))
+ padata_put_pd(pd);
}
}
diff --git a/kernel/panic.c b/kernel/panic.c
index fbc59b3b64d0..ddad0578355b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -832,9 +832,15 @@ device_initcall(register_warn_debugfs);
*/
__visible noinstr void __stack_chk_fail(void)
{
+ unsigned long flags;
+
instrumentation_begin();
+ flags = user_access_save();
+
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+
+ user_access_restore(flags);
instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
diff --git a/kernel/params.c b/kernel/params.c
index 2e447f8ae183..9935ff599356 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -763,7 +763,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
@@ -805,7 +805,7 @@ static void __init kernel_add_sysfs_param(const char *name,
struct module_kobject *mk;
int err;
- mk = locate_module_kobject(name);
+ mk = lookup_or_create_module_kobject(name);
if (!mk)
return;
@@ -876,7 +876,7 @@ static void __init version_sysfs_builtin(void)
int err;
for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
- mk = locate_module_kobject(vattr->module_name);
+ mk = lookup_or_create_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
WARN_ON_ONCE(err);
@@ -949,7 +949,9 @@ struct kset *module_kset;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
const struct kobj_type module_ktype = {
diff --git a/kernel/power/console.c b/kernel/power/console.c
index fcdf0e14a47d..19c48aa5355d 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -16,6 +16,7 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static bool vt_switch_done;
static DEFINE_MUTEX(vt_switch_mutex);
@@ -136,17 +137,21 @@ void pm_prepare_console(void)
if (orig_fgconsole < 0)
return;
+ vt_switch_done = true;
+
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
return;
}
void pm_restore_console(void)
{
- if (!pm_vt_switch())
+ if (!pm_vt_switch() && !vt_switch_done)
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
}
+
+ vt_switch_done = false;
}
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3..1c9fe741fe6d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
-static void em_destroy_table_rcu(struct rcu_head *rp)
-{
- struct em_perf_table __rcu *table;
-
- table = container_of(rp, struct em_perf_table, rcu);
- kfree(table);
-}
-
static void em_release_table_kref(struct kref *kref)
{
- struct em_perf_table __rcu *table;
-
/* It was the last owner of this table so we can free */
- table = container_of(kref, struct em_perf_table, kref);
-
- call_rcu(&table->rcu, em_destroy_table_rcu);
+ kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}
/**
@@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref)
*
* No return values.
*/
-void em_table_free(struct em_perf_table __rcu *table)
+void em_table_free(struct em_perf_table *table)
{
kref_put(&table->kref, em_release_table_kref);
}
@@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table)
* has a user.
* Returns allocated table or NULL.
*/
-struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *table;
+ struct em_perf_table *table;
int table_size;
table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
@@ -245,6 +233,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table,
unsigned long prev_cost = ULONG_MAX;
int i, ret;
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return 0;
+
/* Compute the cost of each performance state. */
for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res, cost;
@@ -308,9 +300,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
* Return 0 on success or an error code on failure.
*/
int em_dev_update_perf_domain(struct device *dev,
- struct em_perf_table __rcu *new_table)
+ struct em_perf_table *new_table)
{
- struct em_perf_table __rcu *old_table;
+ struct em_perf_table *old_table;
struct em_perf_domain *pd;
if (!dev)
@@ -327,7 +319,8 @@ int em_dev_update_perf_domain(struct device *dev,
kref_get(&new_table->kref);
- old_table = pd->em_table;
+ old_table = rcu_dereference_protected(pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
rcu_assign_pointer(pd->em_table, new_table);
em_cpufreq_update_efficiencies(dev, new_table->state);
@@ -399,7 +392,7 @@ static int em_create_pd(struct device *dev, int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
unsigned long flags)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -559,6 +552,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
bool microwatts)
{
+ struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
int cpu, ret;
@@ -629,7 +623,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
dev->em_pd->flags |= flags;
- em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
+ em_table = rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ em_cpufreq_update_efficiencies(dev, em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
@@ -666,7 +662,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- em_table_free(dev->em_pd->em_table);
+ em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex)));
kfree(dev->em_pd);
dev->em_pd = NULL;
@@ -674,9 +671,9 @@ void em_dev_unregister_perf_domain(struct device *dev)
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
-static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
+static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_state *ps, *new_ps;
int ps_size;
@@ -698,7 +695,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
}
static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
- struct em_perf_table __rcu *em_table)
+ struct em_perf_table *em_table)
{
int ret;
@@ -729,7 +726,7 @@ static void em_adjust_new_capacity(struct device *dev,
struct em_perf_domain *pd,
u64 max_cap)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
em_table = em_table_dup(pd);
if (!em_table) {
@@ -820,7 +817,7 @@ static void em_update_workfn(struct work_struct *work)
*/
int em_dev_update_chip_binning(struct device *dev)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
int i, ret;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index d8bad1eeedd3..85008ead2ac9 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -89,6 +89,11 @@ void hibernate_release(void)
atomic_inc(&hibernate_atomic);
}
+bool hibernation_in_progress(void)
+{
+ return !atomic_read(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
return nohibernate == 0 &&
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6254814d4817..0622e7dacf17 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -613,7 +613,8 @@ bool pm_debug_messages_on __read_mostly;
bool pm_debug_messages_should_print(void)
{
- return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON;
+ return pm_debug_messages_on && (hibernation_in_progress() ||
+ pm_suspend_target_state != PM_SUSPEND_ON);
}
EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index de0e6b1077f2..6d1ec7b23e84 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -71,10 +71,14 @@ extern void enable_restore_image_protection(void);
static inline void enable_restore_image_protection(void) {}
#endif /* CONFIG_STRICT_KERNEL_RWX */
+extern bool hibernation_in_progress(void);
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
+
+static inline bool hibernation_in_progress(void) { return false; }
#endif /* !CONFIG_HIBERNATION */
#define power_attr(_name) \
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 52571dcad768..4e941999a53b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
+ if (len > 0)
+ --len;
+
len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fd12efcc4aed..e7a3af81b173 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
/**
* nbcon_context_try_acquire_direct - Try to acquire directly
- * @ctxt: The context of the caller
- * @cur: The current console state
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ * @is_reacquire: This acquire is a reacquire
*
* Acquire the console when it is released. Also acquire the console when
* the current owner has a lower priority and the console is in a safe state.
@@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
*
* Errors:
*
- * -EPERM: A panic is in progress and this is not the panic CPU.
- * Or the current owner or waiter has the same or higher
- * priority. No acquire method can be successful in
- * this case.
+ * -EPERM: A panic is in progress and this is neither the panic
+ * CPU nor is this a reacquire. Or the current owner or
+ * waiter has the same or higher priority. No acquire
+ * method can be successful in these cases.
*
* -EBUSY: The current owner has a lower priority but the console
* in an unsafe state. The caller should try using
* the handover acquire method.
*/
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
- struct nbcon_state *cur)
+ struct nbcon_state *cur, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
do {
/*
- * Panic does not imply that the console is owned. However, it
- * is critical that non-panic CPUs during panic are unable to
- * acquire ownership in order to satisfy the assumptions of
- * nbcon_waiter_matches(). In particular, the assumption that
- * lower priorities are ignored during panic.
+ * Panic does not imply that the console is owned. However,
+ * since all non-panic CPUs are stopped during panic(), it
+ * is safer to have them avoid gaining console ownership.
+ *
+ * If this acquire is a reacquire (and an unsafe takeover
+ * has not previously occurred) then it is allowed to attempt
+ * a direct acquire in panic. This gives console drivers an
+ * opportunity to perform any necessary cleanup if they were
+ * interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic())
+ if (other_cpu_in_panic() &&
+ (!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
+ }
if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
return -EPERM;
@@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #1 implies this context is EMERGENCY.
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
- * Events #4 and #5 are not possible due to the other_cpu_in_panic()
- * check in nbcon_context_try_acquire_direct().
+ * Event #4 occurs when a non-panic CPU reacquires.
+ * Event #5 is not possible due to the other_cpu_in_panic() check
+ * in nbcon_context_try_acquire_handover().
*/
return (cur->req_prio == expected_prio);
@@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
WARN_ON_ONCE(!cur->unsafe);
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * wait for a handover in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
/* Handover is not possible on the same CPU. */
if (cur->cpu == cpu)
return -EBUSY;
@@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs;
/**
* nbcon_context_try_acquire - Try to acquire nbcon console
- * @ctxt: The context of the caller
+ * @ctxt: The context of the caller
+ * @is_reacquire: This acquire is a reacquire
*
* Context: Under @ctxt->con->device_lock() or local_irq_save().
* Return: True if the console was acquired. False otherwise.
@@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs;
* in an unsafe state. Otherwise, on success the caller may assume
* the console is not in an unsafe state.
*/
-static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
nbcon_state_read(con, &cur);
try_again:
- err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
if (err != -EBUSY)
goto out;
@@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
- while (!nbcon_context_try_acquire(ctxt))
+ while (!nbcon_context_try_acquire(ctxt, true))
cpu_relax();
nbcon_write_context_set_buf(wctxt, NULL, 0);
@@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
cant_migrate();
}
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
goto out;
/*
@@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
ctxt->prio = nbcon_get_default_prio();
ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return -EPERM;
while (nbcon_seq_read(con) < stop_seq) {
@@ -1762,7 +1781,7 @@ bool nbcon_device_try_acquire(struct console *con)
ctxt->console = con;
ctxt->prio = NBCON_PRIO_NORMAL;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return false;
if (!nbcon_context_enter_unsafe(ctxt))
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 881a26e18c65..3a91b739e8f3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3310,7 +3310,12 @@ void console_unblank(void)
*/
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ short flags = console_srcu_read_flags(c);
+
+ if (flags & CON_SUSPENDED)
+ continue;
+
+ if ((flags & CON_ENABLED) && c->unblank) {
found_unblank = true;
break;
}
@@ -3347,7 +3352,12 @@ void console_unblank(void)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
+ short flags = console_srcu_read_flags(c);
+
+ if (flags & CON_SUSPENDED)
+ continue;
+
+ if ((flags & CON_ENABLED) && c->unblank)
c->unblank();
}
console_srcu_read_unlock(cookie);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index feb3ac1dc5d5..f87c9d6d36fc 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1));
}
/*
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 0db9db73f57f..36b78d5a0675 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -81,7 +81,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
// Number of typesafe_lookup structures, that is, the degree of concurrency.
torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment.");
+torture_param(int, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
// Number of runs.
@@ -1029,7 +1029,7 @@ static void
ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
@@ -1126,12 +1126,16 @@ ref_scale_init(void)
// Reader tasks (default to ~75% of online CPUs).
if (nreaders < 0)
nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
- if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops))
loops = 1;
if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
nreaders = 1;
if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
nruns = 1;
+ if (WARN_ONCE(loops > INT_MAX / nreaders,
+ "%s: nreaders * loops will overflow, adjusted loops to %d",
+ __func__, INT_MAX / nreaders))
+ loops = INT_MAX / nreaders;
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e52c1dd0628..7b073b8b5e91 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -802,6 +802,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp)
return 0;
}
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+
/*
* Returns positive if the specified CPU has passed through a quiescent state
* by virtue of being in or having passed through an dynticks idle state since
@@ -937,9 +941,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp)
rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
- rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
- rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
- rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(cpu);
rsrp->jiffies = jiffies;
rsrp->gp_seq = rdp->gp_seq;
}
@@ -1822,10 +1826,14 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
@@ -3068,6 +3076,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
@@ -4183,14 +4195,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
*/
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- struct rcu_node *rnp = rcu_get_root();
-
/*
* Any prior manipulation of RCU-protected data must happen
* before the loads from ->gp_seq and ->expedited_sequence.
*/
smp_mb(); /* ^^^ */
- rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
@@ -5012,6 +5027,8 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a9a811d9d7a3..8ba04b179416 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -168,12 +168,23 @@ struct rcu_snap_record {
u64 cputime_irq; /* Accumulated cputime of hard irqs */
u64 cputime_softirq;/* Accumulated cputime of soft irqs */
u64 cputime_system; /* Accumulated cputime of kernel tasks */
- unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ u64 nr_hardirqs; /* Accumulated number of hard irqs */
unsigned int nr_softirqs; /* Accumulated number of soft irqs */
unsigned long long nr_csw; /* Accumulated number of task switches */
unsigned long jiffies; /* Track jiffies value */
};
+/*
+ * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * to report quiescent states at the soonest possible time.
+ * The request can be in one of the following states:
+ * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * ran and we still haven't reported a quiescent state.
+ */
+#define DEFER_QS_IDLE 0
+#define DEFER_QS_PENDING 1
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -191,7 +202,7 @@ struct rcu_data {
/* during and after the last grace */
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
- bool defer_qs_iw_pending; /* Scheduler attention pending? */
+ int defer_qs_iw_pending; /* Scheduler attention pending? */
struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
@@ -476,6 +487,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 2605dd234a13..a1a5942d7017 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
* callback storms, no need to wake up too early.
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
- rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
@@ -1152,7 +1152,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
{
int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
@@ -1162,7 +1161,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ if (WARN_ON_ONCE(!rdp->nocb_gp_kthread))
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
@@ -1172,7 +1171,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ wake_up_process(rdp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_nocb_rdp_offload_wait_cond(rdp));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1c7cbd145d5e..2d865b2096be 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -485,13 +485,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_node *rnp;
union rcu_special special;
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- rdp = this_cpu_ptr(&rcu_data);
if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
@@ -623,10 +626,29 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
*/
static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
+ unsigned long flags;
struct rcu_data *rdp;
rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
- rdp->defer_qs_iw_pending = false;
+ local_irq_save(flags);
+
+ /*
+ * If the IRQ work handler happens to run in the middle of RCU read-side
+ * critical section, it could be ineffective in getting the scheduler's
+ * attention to report a deferred quiescent state (the whole point of the
+ * IRQ work). For this reason, requeue the IRQ work.
+ *
+ * Basically, we want to avoid following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+
+ local_irq_restore(flags);
}
/*
@@ -672,17 +694,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+ expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- IS_ENABLED(CONFIG_PREEMPT_RT))
- rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
- rcu_preempt_deferred_qs_handler);
- else
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
- rdp->defer_qs_iw_pending = true;
+ rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
@@ -821,6 +837,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -832,8 +852,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -974,13 +1003,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
@@ -1008,6 +1040,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4432db6d0b99..4d524a2212a8 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -457,8 +457,8 @@ static void print_cpu_stat_info(int cpu)
rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
pr_err("\t hardirqs softirqs csw/system\n");
- pr_err("\t number: %8ld %10d %12lld\n",
- kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ pr_err("\t number: %8lld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs,
kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
nr_context_switches_cpu(cpu) - rsrp->nr_csw);
pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
diff --git a/kernel/resource.c b/kernel/resource.c
index 4101016e8b20..1d48ae864635 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1268,8 +1268,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent
* become unavailable to other users. Conflicts are
* not expected. Warn to aid debugging if encountered.
*/
- if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
- pr_warn("Unaddressable device %s %pR conflicts with %pR",
+ if (parent == &iomem_resource &&
+ conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
conflict->name, conflict, res);
}
if (conflict != parent) {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9de6e35fe679..23894ba8250c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -149,6 +149,29 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
return 0;
}
+/*
+ * Get the user-space pointer value stored in the 'rseq_cs' field.
+ */
+static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
+{
+ if (!rseq_cs)
+ return -EFAULT;
+
+#ifdef CONFIG_64BIT
+ if (get_user(*rseq_cs, &rseq->rseq_cs))
+ return -EFAULT;
+#else
+ if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+#endif
+
+ return 0;
+}
+
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
struct rseq_cs __user *urseq_cs;
@@ -157,17 +180,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
-#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
- return -EFAULT;
-#endif
+ ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
+ if (ret)
+ return ret;
+
+ /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
}
+ /* Check that the pointer value fits in the user-space process space. */
if (ptr >= TASK_SIZE)
return -EINVAL;
urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
@@ -243,7 +265,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
return !!event_mask;
}
-static int clear_rseq_cs(struct task_struct *t)
+static int clear_rseq_cs(struct rseq __user *rseq)
{
/*
* The rseq_cs field is set to NULL on preemption or signal
@@ -254,9 +276,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs);
+ return put_user(0UL, &rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
+ if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
@@ -288,11 +310,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* Clear the rseq_cs pointer and return.
*/
if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t);
+ return clear_rseq_cs(t->rseq);
ret = rseq_need_restart(t, rseq_cs.flags);
if (ret <= 0)
return ret;
- ret = clear_rseq_cs(t);
+ ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
@@ -366,6 +388,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int, flags, u32, sig)
{
int ret;
+ u64 rseq_cs;
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
@@ -420,6 +443,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields.
+ */
+ if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
+ return -EFAULT;
+ if (rseq_cs && clear_rseq_cs(rseq))
+ return -EFAULT;
+
current->rseq = rseq;
current->rseq_len = rseq_len;
current->rseq_sig = sig;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9bb1b4c5842..4b1953b6c76a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1303,7 +1303,7 @@ bool sched_can_stop_tick(struct rq *rq)
if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
- if (rq->cfs.h_nr_running > 1)
+ if (rq->cfs.h_nr_queued > 1)
return false;
/*
@@ -2229,6 +2229,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
+ /*
+ * If task is sched_delayed, force dequeue it, to avoid always
+ * hitting the tick timeout in the queued case
+ */
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
@@ -3885,6 +3891,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (task_on_scx(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
@@ -5970,7 +5981,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
- rq->nr_running == rq->cfs.h_nr_running)) {
+ rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
@@ -6517,12 +6528,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* Otherwise marks the task's __state as RUNNING
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long task_state)
+ unsigned long *task_state_p)
{
+ unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
+ *task_state_p = TASK_RUNNING;
return false;
}
@@ -6656,7 +6669,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, &prev_state);
switch_count = &prev->nvcsw;
}
@@ -8415,7 +8428,7 @@ void __init sched_init(void)
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8855,7 +8868,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
- scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ scx_tg_init(tg);
alloc_uclamp_sched_group(tg, parent);
return tg;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e51d5ce730be..e5ced97d9681 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -81,9 +81,20 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->limits_changed)) {
- sg_policy->limits_changed = false;
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+ WRITE_ONCE(sg_policy->limits_changed, false);
+ sg_policy->need_freq_update = true;
+
+ /*
+ * The above limits_changed update must occur before the reads
+ * of policy limits in cpufreq_driver_resolve_freq() or a policy
+ * limits update might be missed, so use a memory barrier to
+ * ensure it.
+ *
+ * This pairs with the write memory barrier in sugov_limits().
+ */
+ smp_mb();
+
return true;
}
@@ -95,10 +106,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->need_freq_update)
+ if (sg_policy->need_freq_update) {
sg_policy->need_freq_update = false;
- else if (sg_policy->next_freq == next_freq)
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
return false;
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
@@ -365,7 +388,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
- sg_cpu->sg_policy->limits_changed = true;
+ WRITE_ONCE(sg_cpu->sg_policy->limits_changed, true);
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -888,7 +911,16 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->limits_changed = true;
+ /*
+ * The limits_changed update below must take place before the updates
+ * of policy limits in cpufreq_set_policy() or a policy limits update
+ * might be missed, so use a memory barrier to ensure it.
+ *
+ * This pairs with the memory barrier in sugov_should_update_freq().
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sg_policy->limits_changed, true);
}
struct cpufreq_governor schedutil_gov = {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e7ae404c8d2..53e3670fbb1e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1485,7 +1485,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
if (dl_entity_is_special(dl_se))
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+ scaled_delta_exec = delta_exec;
+ if (!dl_server(dl_se))
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
dl_se->runtime -= scaled_delta_exec;
@@ -1592,7 +1594,7 @@ throttle:
*/
void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
{
- s64 delta_exec, scaled_delta_exec;
+ s64 delta_exec;
if (!rq->fair_server.dl_defer)
return;
@@ -1605,9 +1607,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
if (delta_exec < 0)
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
-
- rq->fair_server.runtime -= scaled_delta_exec;
+ rq->fair_server.runtime -= delta_exec;
if (rq->fair_server.runtime < 0) {
rq->fair_server.dl_defer_running = 0;
@@ -3227,6 +3227,9 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+ for_each_possible_cpu(cpu)
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
@@ -3242,7 +3245,6 @@ void sched_dl_do_global(void)
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1e3bc0774efd..9815f9a0cd59 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -378,7 +378,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_running) {
+ if (rq->cfs.h_nr_queued) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
@@ -391,7 +391,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
- if (rq->cfs.h_nr_running)
+ if (rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
}
@@ -843,7 +843,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
cfs_rq->idle_nr_running);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fcf968490308..563a7dc2ece6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3918,6 +3918,11 @@ static void scx_cgroup_warn_missing_idle(struct task_group *tg)
cgroup_warned_missing_idle = true;
}
+void scx_tg_init(struct task_group *tg)
+{
+ tg->scx_weight = CGROUP_WEIGHT_DFL;
+}
+
int scx_tg_online(struct task_group *tg)
{
int ret = 0;
@@ -4053,12 +4058,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
percpu_down_read(&scx_cgroup_rwsem);
- if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
- tg_cgrp(tg), weight);
- tg->scx_weight = weight;
- }
+ if (scx_cgroup_enabled && SCX_HAS_OP(cgroup_set_weight) &&
+ tg->scx_weight != weight)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ tg_cgrp(tg), weight);
+
+ tg->scx_weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4530,7 +4535,7 @@ unlock:
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4546,7 +4551,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
@@ -5215,6 +5220,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for_each_possible_cpu(cpu)
cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
+ reset_idle_masks();
+ static_branch_enable(&scx_builtin_idle_enabled);
+ } else {
+ static_branch_disable(&scx_builtin_idle_enabled);
+ }
+
/*
* Keep CPUs stable during enable so that the BPF scheduler can track
* online CPUs by watching ->on/offline_cpu() after ->init().
@@ -5282,13 +5294,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
- reset_idle_masks();
- static_branch_enable(&scx_builtin_idle_enabled);
- } else {
- static_branch_disable(&scx_builtin_idle_enabled);
- }
-
/*
* Lock out forks, cgroup on/offlining and moves before opening the
* floodgate so that they don't wander into the operations prematurely.
@@ -5367,6 +5372,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
__setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (!tryget_task_struct(p))
+ continue;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -5379,6 +5387,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
+ put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -6731,6 +6740,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1079b56b0f7a..67032c30c754 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -70,6 +70,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#ifdef CONFIG_CGROUP_SCHED
#ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
@@ -79,6 +80,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
#else /* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ceb023629d48..af61769b1d50 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -71,10 +71,10 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_base_slice = 750000ULL;
-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -2147,7 +2147,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_running;
+ ns->nr_running += rq->cfs.h_nr_queued;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -5427,7 +5427,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_queued of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5511,6 +5511,7 @@ static void set_delayed(struct sched_entity *se)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_runnable--;
cfs_rq->h_nr_delayed++;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5533,6 +5534,7 @@ static void clear_delayed(struct sched_entity *se)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_runnable++;
cfs_rq->h_nr_delayed--;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5583,7 +5585,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_queued of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -5985,8 +5987,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_task_delta, delayed_delta, dequeue = 1;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -6016,7 +6018,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
@@ -6038,9 +6041,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
@@ -6061,18 +6065,19 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
}
/* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, task_delta);
+ sub_nr_running(rq, queued_delta);
/* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
done:
/*
@@ -6091,8 +6096,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, delayed_delta;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_task_delta, delayed_delta;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6125,7 +6130,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
- task_delta = cfs_rq->h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
@@ -6141,9 +6147,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
@@ -6159,9 +6166,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
@@ -6171,11 +6179,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_running && rq->cfs.h_nr_running)
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
+ add_nr_running(rq, queued_delta);
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -6890,7 +6898,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
SCHED_WARN_ON(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
u64 slice = se->slice;
s64 delta = slice - ran;
@@ -7033,7 +7041,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
int h_nr_delayed = 0;
int task_new = !(flags & ENQUEUE_WAKEUP);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
/*
@@ -7081,7 +7089,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable++;
+ cfs_rq->h_nr_queued++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
@@ -7107,7 +7117,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable++;
+ cfs_rq->h_nr_queued++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
@@ -7119,7 +7131,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto enqueue_throttle;
}
- if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
@@ -7166,25 +7178,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
- int h_nr_running = 0;
+ int h_nr_queued = 0;
int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
if (entity_is_task(se)) {
p = task_of(se);
- h_nr_running = 1;
+ h_nr_queued = 1;
idle_h_nr_running = task_has_idle_policy(p);
if (!task_sleep && !task_delayed)
h_nr_delayed = !!se->sched_delayed;
- } else {
- cfs_rq = group_cfs_rq(se);
- slice = cfs_rq_min_slice(cfs_rq);
}
for_each_sched_entity(se) {
@@ -7194,15 +7203,18 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (p && &p->se == se)
return -1;
+ slice = cfs_rq_min_slice(cfs_rq);
break;
}
- cfs_rq->h_nr_running -= h_nr_running;
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable -= h_nr_queued;
+ cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ idle_h_nr_running = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7238,21 +7250,23 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running -= h_nr_running;
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable -= h_nr_queued;
+ cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ idle_h_nr_running = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
return 0;
}
- sub_nr_running(rq, h_nr_running);
+ sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
@@ -7299,6 +7313,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
return true;
}
+static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
+{
+ return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
+}
+
#ifdef CONFIG_SMP
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
@@ -7460,8 +7479,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
- if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return this_cpu;
+ if (sync) {
+ struct rq *rq = cpu_rq(this_cpu);
+
+ if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
+ return this_cpu;
+ }
if (available_idle_cpu(prev_cpu))
return prev_cpu;
@@ -10396,7 +10419,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_running != 1)
+ if (rq->cfs.h_nr_queued != 1)
return false;
return check_cpu_capacity(rq, sd);
@@ -10431,7 +10454,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_queued;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
@@ -10746,7 +10769,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11528,7 +11551,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- nr_running = rq->cfs.h_nr_running;
+ nr_running = rq->cfs.h_nr_queued;
if (!nr_running)
continue;
@@ -11687,7 +11710,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.h_nr_queued == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -12197,8 +12220,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
+ *
+ * sched_balance_newidle() bumps the cost whenever newidle
+ * balance fails, and we don't want things to grow out of
+ * control. Use the sysctl_sched_migration_cost as the upper
+ * limit, plus a litle extra to avoid off by ones.
*/
- sd->max_newidle_lb_cost = cost;
+ sd->max_newidle_lb_cost =
+ min(cost, sysctl_sched_migration_cost + 200);
sd->last_decay_max_lb_cost = jiffies;
} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
/*
@@ -12430,7 +12459,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12903,10 +12932,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Failing newidle means it is not effective;
+ * bump the cost so we end up doing less of it.
+ */
+ if (!pulled_task)
+ domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
+
+ update_newidle_cost(sd, domain_cost);
}
/*
@@ -12928,11 +12964,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;
out:
@@ -13619,7 +13655,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
parent_cfs_rq->idle_nr_running--;
}
- idle_task_delta = grp_cfs_rq->h_nr_running -
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
grp_cfs_rq->idle_h_nr_running;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index c48900b856a2..52ca8e268cfc 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -80,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (int)this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 171a802420a1..8189a35e53fe 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_running
+ * se_runnable() = grq->h_nr_queued
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
+ cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 84dad1511d1e..7d0f8fdd48a3 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -172,17 +172,35 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
static void psi_avgs_work(struct work_struct *work);
static void poll_timer_fn(struct timer_list *t);
static void group_init(struct psi_group *group)
{
- int cpu;
-
group->enabled = true;
- for_each_possible_cpu(cpu)
- seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
@@ -262,14 +280,14 @@ static void get_recent_times(struct psi_group *group, int cpu,
/* Snapshot a coherent view of the CPU state */
do {
- seq = read_seqcount_begin(&groupc->seq);
+ seq = psi_read_begin(cpu);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
if (cpu == current_cpu)
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
- } while (read_seqcount_retry(&groupc->seq, seq));
+ } while (psi_read_retry(cpu, seq));
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
@@ -768,31 +786,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
groupc->times[PSI_NONIDLE] += delta;
}
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
- bool wake_clock)
+ u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
u32 state_mask;
- u64 now;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we update the task counts according to the state
- * change requested through the @clear and @set bits.
- *
- * Then if the cgroup PSI stats accounting enabled, we
- * assess the aggregate resource states this CPU's tasks
- * have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- */
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
- /*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
@@ -843,7 +851,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
return;
}
@@ -864,8 +871,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
-
if (state_mask & group->rtpoll_states)
psi_schedule_rtpoll_work(group, 1, false);
@@ -900,24 +905,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
- struct psi_group *group;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- group = task_psi_group(task);
- do {
- psi_group_change(group, cpu, clear, set, true);
- } while ((group = group->parent));
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep)
{
- struct psi_group *group, *common = NULL;
+ struct psi_group *common = NULL;
int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -926,16 +936,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
- group = task_psi_group(next);
- do {
- if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
- PSI_ONCPU) {
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
common = group;
break;
}
-
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
}
if (prev->pid) {
@@ -968,12 +977,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
psi_flags_change(prev, clear, set);
- group = task_psi_group(prev);
- do {
+ for_each_group(group, task_psi_group(prev)) {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, wake_clock);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
/*
* TSK_ONCPU is handled up to the common ancestor. If there are
@@ -983,20 +991,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
*/
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, wake_clock);
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+ psi_write_end(cpu);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
int cpu = task_cpu(curr);
- struct psi_group *group;
struct psi_group_cpu *groupc;
s64 delta;
u64 irq;
+ u64 now;
if (static_branch_likely(&psi_disabled))
return;
@@ -1005,8 +1014,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
lockdep_assert_rq_held(rq);
- group = task_psi_group(curr);
- if (prev && task_psi_group(prev) == group)
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
return;
irq = irq_time_read(cpu);
@@ -1015,25 +1023,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
rq->psi_irq_time = irq;
- do {
- u64 now;
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
- write_seqcount_end(&groupc->seq);
-
if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
psi_schedule_rtpoll_work(group, 1, false);
- } while ((group = group->parent));
+ }
+ psi_write_end(cpu);
}
#endif
@@ -1221,12 +1226,14 @@ void psi_cgroup_restart(struct psi_group *group)
return;
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
+ u64 now;
- rq_lock_irq(rq, &rf);
- psi_group_change(group, cpu, 0, 0, true);
- rq_unlock_irq(rq, &rf);
+ guard(rq_lock_irq)(cpu_rq(cpu));
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
}
}
#endif /* CONFIG_CGROUPS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 172c588de542..6ad6717084ed 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2951,6 +2951,12 @@ undo:
}
mutex_unlock(&mutex);
+ /*
+ * After changing maximum available bandwidth for DEADLINE, we need to
+ * recompute per root domain and per cpus variables accordingly.
+ */
+ rebuild_sched_domains();
+
return ret;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d79de755c1c2..a441990fe808 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -651,7 +651,8 @@ struct balance_callback {
struct cfs_rq {
struct load_weight load;
unsigned int nr_running;
- unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
unsigned int h_nr_delayed;
@@ -907,7 +908,7 @@ static inline void se_update_runnable(struct sched_entity *se)
if (!entity_is_task(se)) {
struct cfs_rq *cfs_rq = se->my_q;
- se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+ se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed;
}
}
@@ -1155,7 +1156,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned int nr_uninterruptible;
+ unsigned long nr_uninterruptible;
struct task_struct __rcu *curr;
struct sched_dl_entity *dl_server;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..4bd825c24e26 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2174,6 +2174,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
diff --git a/kernel/signal.c b/kernel/signal.c
index 2ae45e6eb6bb..468b589c39e6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2063,8 +2063,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
- * tsk is a group leader and has no threads, wake up the
- * non-PIDFD_THREAD waiters.
+ * Notify for thread-group leaders without subthreads.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8c4524ce65fa..00ff17635041 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
.lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key bh_lock_key;
+struct lockdep_map bh_lock_map = {
+ .name = "local_bh",
+ .key = &bh_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
+ .lock_type = LD_LOCK_PERCPU,
+};
+EXPORT_SYMBOL_GPL(bh_lock_map);
+#endif
+
/**
* local_bh_blocked() - Check for idle whether BH processing is blocked
*
@@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
+ lock_map_acquire_read(&bh_lock_map);
+
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
@@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
+ lock_map_release(&bh_lock_map);
+
local_irq_save(flags);
curcnt = __this_cpu_read(softirq_ctrl.cnt);
@@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void)
/* Counterpart to ksoftirqd_run_begin() */
static inline void ksoftirqd_run_end(void)
{
+ /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */
+ lock_map_release(&bh_lock_map);
__local_bh_enable(SOFTIRQ_OFFSET, true);
WARN_ON_ONCE(in_interrupt());
local_irq_enable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index da821ce258ea..d758e66ad59e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ if (enabled)
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -263,7 +261,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -299,8 +296,8 @@ retry:
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -315,7 +312,10 @@ unlock:
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
+ }
preempt_enable();
return err;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 58fb7280cabb..ae862ad9642c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -302,7 +302,7 @@ static void clocksource_verify_choose_cpus(void)
{
int cpu, i, n = verify_n_cpus;
- if (n < 0) {
+ if (n < 0 || n >= num_online_cpus()) {
/* Check all of the CPUs. */
cpumask_copy(&cpus_chosen, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index db9c06bb2311..cd9cb7ccb540 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.csd = CSD_INIT(retrigger_next_event, NULL)
};
-static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
- /* Make sure we catch unsupported clockids */
- [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
-
- [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
- [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
- [CLOCK_TAI] = HRTIMER_BASE_TAI,
-};
-
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -808,10 +798,10 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
@@ -1597,14 +1587,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- if (likely(clock_id < MAX_CLOCKS)) {
- int base = hrtimer_clock_to_base_table[clock_id];
-
- if (likely(base != HRTIMER_MAX_CLOCK_BASES))
- return base;
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
+ case CLOCK_MONOTONIC:
+ return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_BOOTTIME:
+ return HRTIMER_BASE_BOOTTIME;
+ case CLOCK_TAI:
+ return HRTIMER_BASE_TAI;
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
}
- WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
- return HRTIMER_BASE_MONOTONIC;
}
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
@@ -2291,11 +2286,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
&new_base->clock_base[i]);
}
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 6bcee4704059..d44641108ba8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1401,6 +1401,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4576aaed13b2..c5d9bfbb082b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -118,6 +118,7 @@ static int posix_timer_add(struct k_itimer *timer)
return id;
}
spin_unlock(&hash_lock);
+ cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
return -EAGAIN;
@@ -513,14 +514,21 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (error)
goto out;
- spin_lock_irq(&current->sighand->siglock);
- /* This makes the timer valid in the hash table */
- WRITE_ONCE(new_timer->it_signal, current->signal);
- hlist_add_head(&new_timer->list, &current->signal->posix_timers);
- spin_unlock_irq(&current->sighand->siglock);
/*
- * After unlocking sighand::siglock @new_timer is subject to
- * concurrent removal and cannot be touched anymore
+ * timer::it_lock ensures that __lock_timer() observes a fully
+ * initialized timer when it observes a valid timer::it_signal.
+ *
+ * sighand::siglock is required to protect signal::posix_timers.
+ */
+ scoped_guard (spinlock_irq, &new_timer->it_lock) {
+ guard(spinlock)(&current->sighand->siglock);
+ /* This makes the timer valid in the hash table */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
+ hlist_add_head(&new_timer->list, &current->signal->posix_timers);
+ }
+ /*
+ * After unlocking @new_timer is subject to concurrent removal and
+ * cannot be touched anymore
*/
return 0;
out:
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a47bcf71defc..9a3859443c04 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -509,6 +509,7 @@ void tick_resume(void)
#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP);
static unsigned int tick_freeze_depth;
/**
@@ -528,9 +529,22 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ /*
+ * All other CPUs have their interrupts disabled and are
+ * suspended to idle. Other tasks have been frozen so there
+ * is no scheduling happening. This means that there is no
+ * concurrency in the system at this point. Therefore it is
+ * okay to acquire a sleeping lock on PREEMPT_RT, such as a
+ * spinlock, because the lock cannot be held by other CPUs
+ * or threads and acquiring it cannot block.
+ *
+ * Inform lockdep about the situation.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
system_state = SYSTEM_SUSPEND;
sched_clock_suspend();
timekeeping_suspend();
+ lock_map_release(&tick_freeze_map);
} else {
tick_suspend_local();
}
@@ -552,8 +566,16 @@ void tick_unfreeze(void)
raw_spin_lock(&tick_freeze_lock);
if (tick_freeze_depth == num_online_cpus()) {
+ /*
+ * Similar to tick_freeze(). On resumption the first CPU may
+ * acquire uncontended sleeping locks while other CPUs block on
+ * tick_freeze_lock.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
timekeeping_resume();
sched_clock_resume();
+ lock_map_release(&tick_freeze_map);
+
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 96933082431f..e3896b2be453 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1218,7 +1218,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1c311c46da50..cfbb46cc4e76 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,7 +46,7 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -98,7 +98,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 55f279ddfd63..3ec7df7dbeec 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -416,10 +416,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -462,7 +463,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
@@ -1827,7 +1828,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void)
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
- if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
@@ -2931,6 +2932,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_kprobe_multi(prog))
return -EINVAL;
@@ -3345,7 +3349,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
if (pid) {
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
+ rcu_read_unlock();
if (!task) {
err = -ESRCH;
goto error_path_put;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c12335499ec9..988a4c4ba97b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1316,6 +1316,8 @@ error:
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
+ if (!ftrace_graph_active)
+ unregister_pm_notifier(&ftrace_suspend_notifier);
}
return ret;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 90b59c627bb8..370cde32c696 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4569,13 +4569,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
} else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
}
+ } else {
+ if (hash)
+ iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash);
+ else
+ iter->hash = EMPTY_HASH;
+ }
- if (!iter->hash) {
- trace_parser_put(&iter->parser);
- goto out_unlock;
- }
- } else
- iter->hash = hash;
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ goto out_unlock;
+ }
ret = 0;
@@ -5944,9 +5948,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
- if (size > 32)
- size = 32;
- new_hash = alloc_ftrace_hash(fls(size));
+ size = fls(size);
+ if (size > FTRACE_HASH_MAX_BITS)
+ size = FTRACE_HASH_MAX_BITS;
+ new_hash = alloc_ftrace_hash(size);
if (!new_hash)
goto out_unlock;
@@ -6444,9 +6449,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
- } else {
- /* For read only, the hash is the ops hash */
- iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
@@ -7294,9 +7296,10 @@ void ftrace_release_mod(struct module *mod)
mutex_lock(&ftrace_lock);
- if (ftrace_disabled)
- goto out_unlock;
-
+ /*
+ * To avoid the UAF problem after the module is unloaded, the
+ * 'mod_map' resource needs to be released unconditionally.
+ */
list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
if (mod_map->mod == mod) {
list_del_rcu(&mod_map->list);
@@ -7305,6 +7308,9 @@ void ftrace_release_mod(struct module *mod)
}
}
+ if (ftrace_disabled)
+ goto out_unlock;
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 314ffc143039..acb0c971a408 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
- struct cpumask cpu_mask;
+ cpumask_var_t cpu_mask;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
if (cpu_affinity > -1) {
- cpumask_clear(&cpu_mask);
- cpumask_set_cpu(cpu_affinity, &cpu_mask);
- if (set_cpus_allowed_ptr(current, &cpu_mask))
+ cpumask_clear(cpu_mask);
+ cpumask_set_cpu(cpu_affinity, cpu_mask);
+ if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
+ free_cpumask_var(cpu_mask);
+
return 0;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e1ffbed8cc5e..95641a46db4c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1832,10 +1832,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
head_page = cpu_buffer->head_page;
- /* If both the head and commit are on the reader_page then we are done. */
- if (head_page == cpu_buffer->reader_page &&
- head_page == cpu_buffer->commit_page)
+ /* If the commit_buffer is the reader page, update the commit page */
+ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+ cpu_buffer->commit_page = cpu_buffer->reader_page;
+ /* Nothing more to do, the only page is the reader page */
goto done;
+ }
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
@@ -2794,6 +2796,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2838,7 +2846,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2878,7 +2885,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2906,8 +2912,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2926,7 +2930,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -5810,24 +5813,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
* @flags: gfp flags to use for memory allocation
*
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
*
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -5853,51 +5852,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
atomic_inc(&cpu_buffer->resize_disabled);
- return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags;
-
- if (!iter)
- return;
-
- cpu_buffer = iter->cpu_buffer;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -6752,7 +6712,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
old_size = buffer->subbuf_size;
/* prevent another thread from changing buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
atomic_inc(&buffer->record_disabled);
/* Make sure all commits have finished */
@@ -6857,7 +6817,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
return 0;
@@ -6866,7 +6825,6 @@ error:
buffer->subbuf_size = old_size;
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -7272,8 +7230,8 @@ consume:
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
- if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- if (missed_events) {
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
struct buffer_data_page *bpage = reader->page;
unsigned int commit;
/*
@@ -7294,13 +7252,23 @@ consume:
local_add(RB_MISSED_STORED, &bpage->commit);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
}
- } else {
- /*
- * There really shouldn't be any missed events if the commit
- * is on the reader page.
- */
- WARN_ON_ONCE(missed_events);
}
cpu_buffer->lost_events = 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ffe1422ab03f..91e6bf1b101a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -787,7 +787,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
/* copy the current bits to the new max */
ret = trace_pid_list_first(filtered_pids, &pid);
while (!ret) {
- trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_set(pid_list, pid);
+ if (ret < 0)
+ goto out;
+
ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
@@ -824,6 +827,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_clear(&parser);
ret = 0;
}
+ out:
trace_parser_put(&parser);
if (ret < 0) {
@@ -1754,7 +1758,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
@@ -1768,7 +1772,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
}
@@ -1778,8 +1782,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* only spaces were written */
if (isspace(ch) || !ch) {
*ppos += read;
- ret = read;
- goto out;
+ return read;
}
}
@@ -1789,11 +1792,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
parser->buffer[parser->idx++] = ch;
else {
ret = -EINVAL;
- goto out;
+ goto fail;
}
+
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
}
@@ -1809,13 +1813,13 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
parser->buffer[parser->idx] = 0;
} else {
ret = -EINVAL;
- goto out;
+ goto fail;
}
*ppos += read;
- ret = read;
-
-out:
+ return read;
+fail:
+ trace_parser_fail(parser);
return ret;
}
@@ -2318,10 +2322,10 @@ int __init register_tracer(struct tracer *type)
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
- goto out_unlock;
+ return ret;
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
- goto out_unlock;
+ return 0;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
@@ -2333,8 +2337,7 @@ int __init register_tracer(struct tracer *type)
/* disable other selftests, since this will break it. */
disable_tracing_selftest("running a tracer");
- out_unlock:
- return ret;
+ return 0;
}
static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
@@ -3343,10 +3346,9 @@ out_nobuffer:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-__printf(3, 0)
-static int
-__trace_array_vprintk(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, va_list args)
+static __printf(3, 0)
+int __trace_array_vprintk(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
@@ -3399,7 +3401,6 @@ out_nobuffer:
return len;
}
-__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
@@ -3429,7 +3430,6 @@ int trace_array_vprintk(struct trace_array *tr,
* Note, trace_array_init_printk() must be called on @tr before this
* can be used.
*/
-__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
{
@@ -3474,7 +3474,6 @@ int trace_array_init_printk(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(trace_array_init_printk);
-__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
@@ -3490,7 +3489,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
return ret;
}
-__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
return trace_array_vprintk(printk_trace, ip, fmt, args);
@@ -4653,21 +4651,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- }
- ring_buffer_read_prepare_sync();
- for_each_tracing_cpu(cpu) {
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- ring_buffer_read_prepare_sync();
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
@@ -6703,13 +6695,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ (size_t)PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
@@ -6960,7 +6953,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
- len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
if (len) {
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
cnt = FAULTED_SIZE;
@@ -7031,7 +7024,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
entry = ring_buffer_event_data(event);
- len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->id, ubuf, cnt);
if (len) {
entry->id = -1;
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
@@ -8573,12 +8566,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
ret = tracing_arm_snapshot(tr);
if (ret < 0)
- goto out;
+ return ret;
ret = register_ftrace_function_probe(glob, tr, ops, count);
if (ret < 0)
tracing_disarm_snapshot(tr);
- out:
+
return ret < 0 ? ret : 0;
}
@@ -10141,10 +10134,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
ret = print_trace_line(&iter);
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
+
+ trace_printk_seq(&iter.seq);
}
touch_nmi_watchdog();
-
- trace_printk_seq(&iter.seq);
}
if (!cnt)
@@ -10479,7 +10472,7 @@ __init static int tracer_alloc_buffers(void)
BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
- goto out;
+ return -ENOMEM;
if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
@@ -10592,7 +10585,6 @@ out_free_cpumask:
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
-out:
return ret;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 04ea327198ba..9b2ae7652cbc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -818,13 +818,15 @@ static inline void __init disable_tracing_selftest(const char *reason)
extern void *head_page(struct trace_array_cpu *data);
extern unsigned long long ns2usecs(u64 nsec);
-extern int
-trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_vprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args);
+
+__printf(2, 0)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(2, 0)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(3, 0)
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
@@ -1228,6 +1230,7 @@ bool ftrace_event_is_function(struct trace_event_call *call);
*/
struct trace_parser {
bool cont;
+ bool fail;
char *buffer;
unsigned idx;
unsigned size;
@@ -1235,7 +1238,7 @@ struct trace_parser {
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
- return (parser->idx != 0);
+ return !parser->fail && parser->idx != 0;
}
static inline bool trace_parser_cont(struct trace_parser *parser)
@@ -1249,6 +1252,11 @@ static inline void trace_parser_clear(struct trace_parser *parser)
parser->idx = 0;
}
+static inline void trace_parser_fail(struct trace_parser *parser)
+{
+ parser->fail = true;
+}
+
extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
@@ -1729,6 +1737,9 @@ extern int event_enable_register_trigger(char *glob,
extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
+extern struct event_trigger_data *
+trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param,
+ void *private_data);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
@@ -1755,11 +1766,6 @@ extern bool event_trigger_check_remove(const char *glob);
extern bool event_trigger_empty_param(const char *param);
extern int event_trigger_separate_filter(char *param_and_filter, char **param,
char **filter, bool param_required);
-extern struct event_trigger_data *
-event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data);
extern int event_trigger_parse_num(char *trigger,
struct event_trigger_data *trigger_data);
extern int event_trigger_set_filter(struct event_command *cmd_ops,
@@ -2145,7 +2151,7 @@ static inline bool is_good_system_name(const char *name)
static inline void sanitize_event_name(char *name)
{
while (*name++ != '\0')
- if (*name == ':' || *name == '.')
+ if (*name == ':' || *name == '.' || *name == '*')
*name = '_';
}
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8a..c9b0533407ed 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -125,6 +125,20 @@ out:
return ret;
}
+/*
+ * Locked version of event creation. The event creation must be protected by
+ * dyn_event_ops_mutex because of protecting trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+ int ret;
+
+ mutex_lock(&dyn_event_ops_mutex);
+ ret = type->create(raw_command);
+ mutex_unlock(&dyn_event_ops_mutex);
+ return ret;
+}
+
static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 11dea25ef880..dbea76058863 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
@@ -2872,7 +2879,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
if (ret < 0)
return ret;
+ down_write(&trace_event_sem);
list_add(&call->list, &ftrace_events);
+ up_write(&trace_event_sem);
+
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
atomic_set(&call->refcnt, 0);
else
@@ -3464,6 +3474,8 @@ __trace_add_event_dirs(struct trace_array *tr)
struct trace_event_call *call;
int ret;
+ lockdep_assert_held(&trace_event_sem);
+
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c611b281a5b..f50c2ad43f3d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4ebafc655223..3379e14d38e9 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5249,17 +5249,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
}
}
+/*
+ * The hist_pad structure is used to save information to create
+ * a histogram from the histogram trigger. It's too big to store
+ * on the stack, so when the histogram trigger is initialized
+ * a percpu array of 4 hist_pad structures is allocated.
+ * This will cover every context from normal, softirq, irq and NMI
+ * in the very unlikely event that a tigger happens at each of
+ * these contexts and interrupts a currently active trigger.
+ */
+struct hist_pad {
+ unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
+ char compound_key[HIST_KEY_SIZE_MAX];
+};
+
+static struct hist_pad __percpu *hist_pads;
+static DEFINE_PER_CPU(int, hist_pad_cnt);
+static refcount_t hist_pad_ref;
+
+/* One hist_pad for every context (normal, softirq, irq, NMI) */
+#define MAX_HIST_CNT 4
+
+static int alloc_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (refcount_read(&hist_pad_ref)) {
+ refcount_inc(&hist_pad_ref);
+ return 0;
+ }
+
+ hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT,
+ __alignof__(struct hist_pad));
+ if (!hist_pads)
+ return -ENOMEM;
+
+ refcount_set(&hist_pad_ref, 1);
+ return 0;
+}
+
+static void free_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (!refcount_dec_and_test(&hist_pad_ref))
+ return;
+
+ free_percpu(hist_pads);
+ hist_pads = NULL;
+}
+
+static struct hist_pad *get_hist_pad(void)
+{
+ struct hist_pad *hist_pad;
+ int cnt;
+
+ if (WARN_ON_ONCE(!hist_pads))
+ return NULL;
+
+ preempt_disable();
+
+ hist_pad = per_cpu_ptr(hist_pads, smp_processor_id());
+
+ if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) {
+ preempt_enable();
+ return NULL;
+ }
+
+ cnt = this_cpu_inc_return(hist_pad_cnt) - 1;
+
+ return &hist_pad[cnt];
+}
+
+static void put_hist_pad(void)
+{
+ this_cpu_dec(hist_pad_cnt);
+ preempt_enable();
+}
+
static void event_hist_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
- unsigned long entries[HIST_STACKTRACE_DEPTH];
- u64 var_ref_vals[TRACING_MAP_VARS_MAX];
- char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
struct hist_field *key_field;
+ struct hist_pad *hist_pad;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -5267,12 +5344,18 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (unlikely(!rbe))
return;
- memset(compound_key, 0, hist_data->key_size);
+ hist_pad = get_hist_pad();
+ if (!hist_pad)
+ return;
+
+ memset(hist_pad->compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ unsigned long *entries = hist_pad->entries;
+
memset(entries, 0, HIST_STACKTRACE_SIZE);
if (key_field->field) {
unsigned long *stack, n_entries;
@@ -5296,26 +5379,31 @@ static void event_hist_trigger(struct event_trigger_data *data,
}
if (use_compound_key)
- add_to_key(compound_key, key, key_field, rec);
+ add_to_key(hist_pad->compound_key, key, key_field, rec);
}
if (use_compound_key)
- key = compound_key;
+ key = hist_pad->compound_key;
if (hist_data->n_var_refs &&
- !resolve_var_refs(hist_data, key, var_ref_vals, false))
- return;
+ !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false))
+ goto out;
elt = tracing_map_insert(hist_data->map, key);
if (!elt)
- return;
+ goto out;
- hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals);
- if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+ if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) {
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe,
+ key, hist_pad->var_ref_vals);
+ }
hist_poll_wakeup();
+
+ out:
+ put_hist_pad();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -6160,6 +6248,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
+ if (alloc_hist_pad() < 0)
+ return -ENOMEM;
+
if (!data->ref && hist_data->attrs->name)
save_named_trigger(hist_data->attrs->name, data);
@@ -6204,6 +6295,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
destroy_hist_data(hist_data);
}
+ free_hist_pad();
}
static struct event_trigger_ops event_hist_trigger_ops = {
@@ -6219,9 +6311,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data)
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(data->named_data);
-
- return 0;
+ return event_hist_trigger_init(data->named_data);
}
static void event_hist_trigger_named_free(struct event_trigger_data *data)
@@ -6708,7 +6798,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a5e3d6acf1e1..d5dbda9b0e4b 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -825,7 +825,7 @@ out:
}
/**
- * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * trigger_data_alloc - allocate and init event_trigger_data for a trigger
* @cmd_ops: The event_command operations for the trigger
* @cmd: The cmd string
* @param: The param string
@@ -836,14 +836,14 @@ out:
* trigger_ops to assign to the event_trigger_data. @private_data can
* also be passed in and associated with the event_trigger_data.
*
- * Use event_trigger_free() to free an event_trigger_data object.
+ * Use trigger_data_free() to free an event_trigger_data object.
*
* Return: The trigger_data object success, NULL otherwise
*/
-struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data)
+struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
{
struct event_trigger_data *trigger_data;
struct event_trigger_ops *trigger_ops;
@@ -1010,13 +1010,13 @@ event_trigger_parse(struct event_command *cmd_ops,
return ret;
ret = -ENOMEM;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
goto out;
if (remove) {
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
- kfree(trigger_data);
+ trigger_data_free(trigger_data);
ret = 0;
goto out;
}
@@ -1043,7 +1043,7 @@ event_trigger_parse(struct event_command *cmd_ops,
out_free:
event_trigger_reset_filter(cmd_ops, trigger_data);
- kfree(trigger_data);
+ trigger_data_free(trigger_data);
goto out;
}
@@ -1581,7 +1581,7 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_event_file *file = data->private_data;
if (file)
- __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
else
trace_dump_stack(STACK_SKIP);
}
@@ -1814,7 +1814,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data);
if (!trigger_data) {
kfree(enable_data);
goto out;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index fbbc3c719d2f..fbb6cf8dc047 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -574,11 +574,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+ __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
- cpu, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu]);
+ ring_buffer_read_start(iter.array_buffer->buffer,
+ cpu, GFP_ATOMIC);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
+ ring_buffer_read_start(iter.array_buffer->buffer,
cpu_file, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 935a886af40c..6b9c3f3f870f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1090,7 +1090,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_kprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a94790f5cda7..216247913980 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -665,8 +665,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit_nostack(buffer, event);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6e7090e8bf30..1ecc28a3946a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -950,11 +950,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -964,7 +965,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 578919962e5d..055f5164bd96 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -154,9 +154,12 @@ fail:
}
static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;
void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.subsystem = subsystem;
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
@@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
void trace_probe_log_clear(void)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
memset(&trace_probe_log, 0, sizeof(trace_probe_log));
}
void trace_probe_log_set_index(int index)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.index = index;
}
@@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type)
char *command, *p;
int i, len = 0, pos = 0;
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
if (!trace_probe_log.argv)
return;
@@ -648,7 +657,7 @@ static int parse_btf_arg(char *varname,
ret = query_btf_context(ctx);
if (ret < 0 || ctx->nr_params == 0) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
- return PTR_ERR(params);
+ return -ENOENT;
}
}
params = ctx->params;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b085a8a164ea..9916677acf24 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -739,7 +739,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_uprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 696406939be5..78f4c4255358 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -212,7 +212,7 @@ void put_ucounts(struct ucounts *ucounts)
}
}
-static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
long c, old;
c = atomic_long_read(v);
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2ef2e1b80091..2f844c279a3e 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop);
* @arg: data to be passed to fn and handled_kill
* @name: the thread's name
*
- * This returns a specialized task for use by the vhost layer or NULL on
+ * This returns a specialized task for use by the vhost layer or ERR_PTR() on
* failure. The returned task is inactive, and the caller must fire it up
* through vhost_task_start().
*/
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4dc72540c3b0..8fbb4385e814 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -863,12 +864,20 @@ int lockup_detector_offline_cpu(unsigned int cpu)
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -881,7 +890,7 @@ static void __lockup_detector_reconfigure(void)
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -901,27 +910,29 @@ static __init void lockup_detector_setup(void)
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -939,11 +950,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -976,7 +987,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr
old = READ_ONCE(*param);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1027,11 +1038,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write,
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1052,7 +1065,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1072,7 +1085,7 @@ static struct ctl_table watchdog_sysctls[] = {
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a9d64e08dffc..3c87eb98609c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7731,7 +7731,8 @@ void __init workqueue_init_early(void)
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
-
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();