Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c        |  11
-rw-r--r--  kernel/bpf/core.c                    |   2
-rw-r--r--  kernel/bpf/verifier.c                | 113
-rw-r--r--  kernel/dma/contiguous.c              |   3
-rw-r--r--  kernel/events/core.c                 |  52
-rw-r--r--  kernel/events/ring_buffer.c          |   2
-rw-r--r--  kernel/events/uprobes.c              |  13
-rw-r--r--  kernel/fork.c                        |   4
-rw-r--r--  kernel/kexec_elf.c                   |   2
-rw-r--r--  kernel/locking/lockdep.c             |   3
-rw-r--r--  kernel/locking/semaphore.c           |  13
-rw-r--r--  kernel/module/Kconfig                |   1
-rw-r--r--  kernel/panic.c                       |   6
-rw-r--r--  kernel/params.c                      |  10
-rw-r--r--  kernel/sched/core.c                  | 114
-rw-r--r--  kernel/sched/cpudeadline.c           |   2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c     |  81
-rw-r--r--  kernel/sched/deadline.c              |   6
-rw-r--r--  kernel/sched/fair.c                  |  40
-rw-r--r--  kernel/sched/rt.c                    |   2
-rw-r--r--  kernel/sched/sched.h                 |  28
-rw-r--r--  kernel/sched/topology.c              |   7
-rw-r--r--  kernel/time/tick-common.c            |  22
-rw-r--r--  kernel/trace/bpf_trace.c             |   9
-rw-r--r--  kernel/trace/ftrace.c                |   8
-rw-r--r--  kernel/trace/ring_buffer.c           |   4
-rw-r--r--  kernel/trace/trace.c                 |  28
-rw-r--r--  kernel/trace/trace.h                 |   1
-rw-r--r--  kernel/trace/trace_boot.c            |   2
-rw-r--r--  kernel/trace/trace_events.c          |  73
-rw-r--r--  kernel/trace/trace_events_filter.c   |   4
-rw-r--r--  kernel/trace/trace_events_hist.c     | 133
-rw-r--r--  kernel/trace/trace_events_synth.c    |  37
-rw-r--r--  kernel/trace/trace_functions_graph.c |   1
-rw-r--r--  kernel/trace/trace_irqsoff.c         |   2
-rw-r--r--  kernel/trace/trace_osnoise.c         |   1
-rw-r--r--  kernel/trace/trace_output.c          |   4
-rw-r--r--  kernel/trace/trace_probe.c           |  28
-rw-r--r--  kernel/trace/trace_sched_wakeup.c    |   2
-rw-r--r--  kernel/watch_queue.c                 |   9
40 files changed, 654 insertions(+), 229 deletions(-)
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index ee1c7b77096e..fbbf3b6b9f83 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -162,6 +162,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -170,21 +171,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- if (!bpf_cgrp_storage_trylock())
- return (unsigned long)NULL;
+ nobusy = bpf_cgrp_storage_trylock();
- sdata = cgroup_storage_lookup(cgroup, map, true);
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
if (sdata)
goto unlock;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, gfp_flags);
unlock:
- bpf_cgrp_storage_unlock();
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
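
The net effect of the hunks above: instead of bailing out whenever bpf_cgrp_storage_trylock() fails (e.g. on recursive entry), the helper still performs the lookup and only gates storage creation and the unlock on actually holding the lock. Condensed into one view (a sketch of the resulting flow, not verbatim kernel source; may_create stands in for the flag-and-refcount check):

	nobusy = bpf_cgrp_storage_trylock();		/* may fail, never blocks */
	sdata = cgroup_storage_lookup(cgroup, map, nobusy);
	if (!sdata && may_create && nobusy)		/* mutate only under the lock */
		sdata = bpf_local_storage_update(...);
	if (nobusy)
		bpf_cgrp_storage_unlock();		/* release only what we took */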
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 02f327f05fd6..81fd1bb99416 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2893,7 +2893,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}
-bool __weak bpf_helper_changes_pkt_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
return false;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6a4102312fa..756e179a1efe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2636,16 +2636,36 @@ static int cmp_subprogs(const void *a, const void *b)
((struct bpf_subprog_info *)b)->start;
}
+/* Find subprogram that contains instruction at 'off' */
+static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_info, env->subprog_cnt,
- sizeof(env->subprog_info[0]), cmp_subprogs);
- if (!p)
+ p = find_containing_subprog(env, off);
+ if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
-
}
static int add_subprog(struct bpf_verifier_env *env, int off)
@@ -9344,6 +9364,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (env->log.level & BPF_LOG_LEVEL)
verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
clear_caller_saved_regs(env, caller->regs);
/* All global functions return a 64-bit SCALAR_VALUE */
@@ -9987,7 +10009,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
/* With LD_ABS/IND some JITs save/restore skb from r1. */
- changes_data = bpf_helper_changes_pkt_data(fn->func);
+ changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
func_id_name(func_id), func_id);
@@ -15094,6 +15116,29 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = find_containing_subprog(env, t);
+ callee = find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -15227,6 +15272,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
bool visit_callee)
{
int ret, insn_sz;
+ int w;
insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
@@ -15238,8 +15284,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
mark_jmp_point(env, t + insn_sz);
if (visit_callee) {
+ w = t + insns[t].imm + 1;
mark_prune_point(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
}
return ret;
}
@@ -15291,6 +15339,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_prune_point(env, t);
mark_jmp_point(env, t);
}
+ if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -15412,6 +15462,7 @@ static int check_cfg(struct bpf_verifier_env *env)
}
}
ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
err_free:
kvfree(insn_state);
@@ -18572,6 +18623,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -19856,6 +19908,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
if (bpf_prog_is_dev_bound(prog->aux) &&
!bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -19884,6 +19937,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension programs should be JITed\n");
return -EINVAL;
}
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
@@ -20106,6 +20167,33 @@ BTF_ID(func, __rcu_read_unlock)
#endif
BTF_SET_END(btf_id_deny)
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
static bool can_be_sleepable(struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING) {
@@ -20194,6 +20282,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
@@ -20311,10 +20404,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
@@ -20329,6 +20418,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
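
find_containing_subprog() relies on subprog_info being sorted by start instruction and returns the rightmost entry whose start is <= off. The midpoint m = l + (r - l + 1) / 2 rounds up so the loop still makes progress when the l = m branch is taken. A self-contained userspace analogue of the same search (hypothetical harness, not kernel code):

	#include <stdio.h>

	/* Rightmost index i with starts[i] <= off; starts[] is sorted
	 * ascending and starts[0] == 0, so every valid off is covered.
	 */
	static int find_containing(const int *starts, int n, int off)
	{
		int l = 0, r = n - 1;

		while (l < r) {
			int m = l + (r - l + 1) / 2;	/* round up: guarantees m > l */

			if (starts[m] <= off)
				l = m;
			else
				r = m - 1;
		}
		return l;
	}

	int main(void)
	{
		int starts[] = { 0, 10, 25 };		/* three subprograms */

		printf("%d %d %d\n",
		       find_containing(starts, 3, 4),	/* 0 */
		       find_containing(starts, 3, 10),	/* 1 */
		       find_containing(starts, 3, 30));	/* 2 */
		return 0;
	}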
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index f005c66f378c..a60081979963 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -70,8 +70,7 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes __initconst =
- (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
static phys_addr_t size_cmdline __initdata = -1;
static phys_addr_t base_cmdline __initdata;
static phys_addr_t limit_cmdline __initdata;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4dd8936b5aa0..987807b1040a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2333,6 +2333,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x08UL
/*
* Cross CPU call to remove a performance event
@@ -2347,6 +2348,7 @@ __perf_remove_from_context(struct perf_event *event,
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
@@ -2358,16 +2360,19 @@ __perf_remove_from_context(struct perf_event *event,
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
+ perf_event_set_state(event, min(event->state, state));
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -11556,6 +11561,21 @@ free_dev:
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
+static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
+{
+ void *tmp, *val = idr_find(idr, id);
+
+ if (val != old)
+ return false;
+
+ tmp = idr_replace(idr, new, id);
+ if (IS_ERR(tmp))
+ return false;
+
+ WARN_ON_ONCE(tmp != val);
+ return true;
+}
+
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret, max = PERF_TYPE_MAX;
@@ -11577,7 +11597,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (type >= 0)
max = type;
- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL);
if (ret < 0)
goto free_pdc;
@@ -11585,6 +11605,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
type = ret;
pmu->type = type;
+ atomic_set(&pmu->exclusive_cnt, 0);
if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
@@ -11633,14 +11654,22 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ /*
+ * Now that the PMU is complete, make it visible to perf_try_init_event().
+ */
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
+ goto free_context;
list_add_rcu(&pmu->entry, &pmus);
- atomic_set(&pmu->exclusive_cnt, 0);
+
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
return ret;
+free_context:
+ free_percpu(pmu->cpu_pmu_context);
+
free_dev:
if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
device_del(pmu->dev);
@@ -13116,12 +13145,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
-
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags | DETACH_EXIT);
/*
* Child events can be freed.
@@ -13395,6 +13419,9 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
@@ -13417,8 +13444,6 @@ inherit_event(struct perf_event *parent_event,
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -13439,7 +13464,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
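
Two of the fixes above share one idea: an object must be fully initialized before it becomes reachable. perf_pmu_register() now reserves the IDR slot with a NULL value and publishes the pmu pointer only after setup succeeds, with idr_cmpxchg() detecting a concurrent change of the slot; inherit_event() similarly sets child_event->ctx before find_get_pmu_context() can observe the event. The registration ordering, condensed (a sketch of the hunks above):

	type = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL);	/* reserve the id only */
	pmu->type = type;
	/* ... finish initializing the pmu ... */
	if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))	/* publish last */
		goto free_context;
	list_add_rcu(&pmu->entry, &pmus);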
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b0930b418552..52de76ef8723 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,7 +19,7 @@
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, EPOLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending_irq);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f5dfc2f22d79..a554f43d3ceb 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -159,6 +159,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
+ pte_t pte;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
@@ -178,6 +179,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (!page_vma_mapped_walk(&pvmw))
goto unlock;
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * Handle PFN swap PTES, such as device-exclusive ones, that actually
+ * map pages: simply trigger GUP again to fix it up.
+ */
+ if (unlikely(!pte_present(pte))) {
+ page_vma_mapped_walk_done(&pvmw);
+ goto unlock;
+ }
if (new_page) {
folio_get(new_folio);
@@ -192,7 +203,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
+ flush_cache_page(vma, addr, pte_pfn(pte));
ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
diff --git a/kernel/fork.c b/kernel/fork.c
index 23efaa2c42e4..97f433fb4b5e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -518,6 +518,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
+ /* track_pfn_copy() will later take care of copying internal state. */
+ if (unlikely(new->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(new);
+
return new;
}
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
index d3689632e8b9..3a5c25b2adc9 100644
--- a/kernel/kexec_elf.c
+++ b/kernel/kexec_elf.c
@@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
struct kexec_buf *kbuf,
unsigned long *lowest_load_addr)
{
- unsigned long lowest_addr = UINT_MAX;
+ unsigned long lowest_addr = ULONG_MAX;
int ret;
size_t i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 3468d8230e5f..9419a79e8833 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6141,6 +6141,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
hlist_del_rcu(&class->hash_entry);
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
if (class - lock_classes == max_lock_class_idx)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 34bfae72f295..de9117c0e671 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
@@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
/**
* down - acquire the semaphore
@@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
void __sched up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
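
__up() now only queues the waiter with wake_q_add(); the actual wakeup happens via wake_up_q() after sem->lock has been released, which shortens the critical section and avoids calling into the scheduler while holding the raw spinlock. The pattern generalizes to any "wake under a spinlock" site (a sketch, assuming a wait list protected by some lock; pick_next_waiter() is a hypothetical helper):

	DEFINE_WAKE_Q(wake_q);

	spin_lock(&lock);
	while ((waiter = pick_next_waiter()))
		wake_q_add(&wake_q, waiter->task);	/* takes a task ref, no wakeup yet */
	spin_unlock(&lock);

	wake_up_q(&wake_q);	/* wakes the tasks and drops the refs, lock not held */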
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..b411315ecd3c 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -229,6 +229,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
choice
prompt "Which hash algorithm should modules be signed with?"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
help
This determines which sort of hashing algorithm will be used during
signature generation. This algorithm _must_ be built into the kernel
diff --git a/kernel/panic.c b/kernel/panic.c
index ef9f9a4e928d..d7973e975474 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -763,9 +763,15 @@ device_initcall(register_warn_debugfs);
*/
__visible noinstr void __stack_chk_fail(void)
{
+ unsigned long flags;
+
instrumentation_begin();
+ flags = user_access_save();
+
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+
+ user_access_restore(flags);
instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
diff --git a/kernel/params.c b/kernel/params.c
index 2d4a0564697e..e39ac5420cd6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -759,7 +759,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
@@ -801,7 +801,7 @@ static void __init kernel_add_sysfs_param(const char *name,
struct module_kobject *mk;
int err;
- mk = locate_module_kobject(name);
+ mk = lookup_or_create_module_kobject(name);
if (!mk)
return;
@@ -872,7 +872,7 @@ static void __init version_sysfs_builtin(void)
int err;
for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
- mk = locate_module_kobject(vattr->module_name);
+ mk = lookup_or_create_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
WARN_ON_ONCE(err);
@@ -945,7 +945,9 @@ struct kset *module_kset;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
const struct kobj_type module_ktype = {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 942734bf7347..760a6c3781cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7406,18 +7406,13 @@ int sched_core_idle_cpu(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7425,45 +7420,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
@@ -7474,28 +7473,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* U' = irq + --------- * U
* max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
@@ -10048,7 +10034,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -10494,7 +10480,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static struct task_group *sched_get_task_group(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10506,13 +10492,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
-
- return tg;
-}
-
-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
-{
- tsk->sched_task_group = group;
+ tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -10533,19 +10513,10 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
- /*
- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
- * group changes.
- */
- group = sched_get_task_group(tsk);
- if (group == tsk->sched_task_group)
- goto unlock;
-
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -10556,7 +10527,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, group);
+ sched_change_group(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10570,7 +10541,6 @@ void sched_move_task(struct task_struct *tsk)
resched_curr(rq);
}
-unlock:
task_rq_unlock(rq, tsk, &rf);
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 57c92d751bcd..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (!dl_task_fits_capacity(p, cpu)) {
cpumask_clear_cpu(cpu, later_mask);
- cap = capacity_orig_of(cpu);
+ cap = arch_scale_cpu_capacity(cpu);
if (cap > max_cap ||
(cpu == task_cpu(p) && cap == max_cap)) {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a49f136014ce..776be0549162 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,7 +47,7 @@ struct sugov_cpu {
u64 last_update;
unsigned long util;
- unsigned long bw_dl;
+ unsigned long bw_min;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -81,9 +81,20 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->limits_changed)) {
- sg_policy->limits_changed = false;
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+ WRITE_ONCE(sg_policy->limits_changed, false);
+ sg_policy->need_freq_update = true;
+
+ /*
+ * The above limits_changed update must occur before the reads
+ * of policy limits in cpufreq_driver_resolve_freq() or a policy
+ * limits update might be missed, so use a memory barrier to
+ * ensure it.
+ *
+ * This pairs with the write memory barrier in sugov_limits().
+ */
+ smp_mb();
+
return true;
}
@@ -95,10 +106,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->need_freq_update)
+ if (sg_policy->need_freq_update) {
sg_policy->need_freq_update = false;
- else if (sg_policy->next_freq == next_freq)
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
return false;
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
@@ -143,7 +166,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- util = map_util_perf(util);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -153,14 +175,30 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
+{
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
+
+ /*
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
+ */
+ return max(min, max);
+}
+
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
- unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
- struct rq *rq = cpu_rq(sg_cpu->cpu);
+ unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
- sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
- FREQUENCY_UTIL, NULL);
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
/**
@@ -306,8 +344,8 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
- if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_cpu->sg_policy->limits_changed = true;
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
+ WRITE_ONCE(sg_cpu->sg_policy->limits_changed, true);
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -407,8 +445,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
- cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), max_cap);
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -817,7 +855,16 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->limits_changed = true;
+ /*
+ * The limits_changed update below must take place before the updates
+ * of policy limits in cpufreq_set_policy() or a policy limits update
+ * might be missed, so use a memory barrier to ensure it.
+ *
+ * This pairs with the memory barrier in sugov_should_update_freq().
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sg_policy->limits_changed, true);
}
struct cpufreq_governor schedutil_gov = {
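
The limits_changed handling now forms an explicit barrier pairing: the writer orders its store of the flag against the subsequent policy-limit updates, and the reader clears the flag before reading the limits, so an update can no longer slip between the reader's flag check and its use of the limits. Condensed from the hunks above (sketch):

	/* writer: sugov_limits() */
	smp_wmb();					/* pairs with smp_mb() below */
	WRITE_ONCE(sg_policy->limits_changed, true);

	/* reader: sugov_should_update_freq() */
	if (READ_ONCE(sg_policy->limits_changed)) {
		WRITE_ONCE(sg_policy->limits_changed, false);
		smp_mb();	/* clear the flag before reading the new limits */
		return true;	/* cpufreq_driver_resolve_freq() then reads the limits */
	}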
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b9e99bc3b1cf..a15cf7969953 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
int i;
for_each_cpu_and(i, mask, cpu_active_mask)
- cap += capacity_orig_of(i);
+ cap += arch_scale_cpu_capacity(i);
return cap;
}
@@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
static inline unsigned long dl_bw_capacity(int i)
{
if (!sched_asym_cpucap_active() &&
- capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
} else {
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -2780,7 +2780,7 @@ int sched_dl_global_validate(void)
* value smaller than the currently allocated bandwidth in
* any of the root_domains.
*/
- for_each_possible_cpu(cpu) {
+ for_each_online_cpu(cpu) {
rcu_read_lock_sched();
if (dl_bw_visited(cpu, gen))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2808dbdd0384..268e2a49b964 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4951,7 +4951,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+ if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
@@ -4999,14 +4999,14 @@ static inline int util_fits_cpu(unsigned long util,
return fits;
/*
- * We must use capacity_orig_of() for comparing against uclamp_min and
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
* uclamp_max. We only care about capacity pressure (by using
* capacity_of()) for comparing against the real util.
*
* If a task is boosted to 1024 for example, we don't want a tiny
* pressure to skew the check whether it fits a CPU or not.
*
- * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
* Only exception is for thermal pressure since it has a direct impact
@@ -5018,7 +5018,7 @@ static inline int util_fits_cpu(unsigned long util,
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
*/
- capacity_orig = capacity_orig_of(cpu);
+ capacity_orig = arch_scale_cpu_capacity(cpu);
capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
@@ -7515,7 +7515,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7757,7 +7757,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util = max(util, util_est);
}
- return min(util, capacity_orig_of(cpu));
+ return min(util, arch_scale_cpu_capacity(cpu));
}
unsigned long cpu_util_cfs(int cpu)
@@ -7859,7 +7859,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
for_each_cpu(cpu, pd_cpus) {
unsigned long util = cpu_util(cpu, p, -1, 0);
- busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
}
eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
@@ -7882,7 +7882,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
- unsigned long eff_util;
+ unsigned long eff_util, min, max;
/*
* Performance domain frequency: utilization clamping
@@ -7891,7 +7891,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
@@ -9544,8 +9560,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
if (!capacity)
capacity = 1;
@@ -9621,7 +9635,7 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
/*
@@ -9632,7 +9646,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
- (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
check_cpu_capacity(rq, sd));
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b89223a97316..91b1ee0d81fc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -519,7 +519,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
- cpu_cap = capacity_orig_of(cpu);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
return cpu_cap >= min(min_cap, max_cap);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d48c6a292a83..60dc51f43dd9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1048,7 +1048,6 @@ struct rq {
struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
struct balance_callback *balance_callback;
@@ -2985,29 +2984,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max);
-/**
- * enum cpu_util_type - CPU utilization type
- * @FREQUENCY_UTIL: Utilization used to select frequency
- * @ENERGY_UTIL: Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within effective_cpu_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum cpu_util_type {
- FREQUENCY_UTIL,
- ENERGY_UTIL,
-};
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p);
/*
* Verify the fitness of task @p to run on @cpu taking into account the
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2ed884bb3621..c61698cff0f3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2486,12 +2486,15 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ unsigned long capacity;
+
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
+ capacity = arch_scale_cpu_capacity(i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
cpu_attach_domain(sd, d.rd, i);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7f2b17fc8ce4..ecdb8c2b2cab 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -495,6 +495,7 @@ void tick_resume(void)
#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP);
static unsigned int tick_freeze_depth;
/**
@@ -514,9 +515,22 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ /*
+ * All other CPUs have their interrupts disabled and are
+ * suspended to idle. Other tasks have been frozen so there
+ * is no scheduling happening. This means that there is no
+ * concurrency in the system at this point. Therefore it is
+ * okay to acquire a sleeping lock on PREEMPT_RT, such as a
+ * spinlock, because the lock cannot be held by other CPUs
+ * or threads and acquiring it cannot block.
+ *
+ * Inform lockdep about the situation.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
system_state = SYSTEM_SUSPEND;
sched_clock_suspend();
timekeeping_suspend();
+ lock_map_release(&tick_freeze_map);
} else {
tick_suspend_local();
}
@@ -538,8 +552,16 @@ void tick_unfreeze(void)
raw_spin_lock(&tick_freeze_lock);
if (tick_freeze_depth == num_online_cpus()) {
+ /*
+ * Similar to tick_freeze(). On resumption the first CPU may
+ * acquire uncontended sleeping locks while other CPUs block on
+ * tick_freeze_lock.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
timekeeping_resume();
sched_clock_resume();
+ lock_map_release(&tick_freeze_map);
+
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9d8f60e0cb55..97f660a8ddc7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -400,7 +400,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -413,10 +413,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -459,7 +460,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
@@ -853,7 +854,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
if (unlikely(is_global_init(current)))
return -EPERM;
- if (!preemptible()) {
+ if (preempt_count() != 0 || irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 99fdeee3bcd8..650493ed76cd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5420,9 +5420,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
- if (size > 32)
- size = 32;
- new_hash = alloc_ftrace_hash(fls(size));
+ size = fls(size);
+ if (size > FTRACE_HASH_MAX_BITS)
+ size = FTRACE_HASH_MAX_BITS;
+ new_hash = alloc_ftrace_hash(size);
if (!new_hash)
goto out_unlock;
@@ -6325,6 +6326,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
}
+ cond_resched();
} while_for_each_ftrace_rec();
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61caff3d4091..62d93db72b0a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5995,9 +5995,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
- data->bytes_dropped += len;
- else
data->bytes_dropped_nested += len;
+ else
+ data->bytes_dropped += len;
}
return len;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d9af60b238e..95868c315730 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7027,13 +7027,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
@@ -9417,7 +9418,8 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret;
}
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -9437,6 +9439,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9480,12 +9488,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -9511,6 +9525,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -9524,7 +9539,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -9536,7 +9551,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -9583,6 +9598,7 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
@@ -10301,7 +10317,7 @@ __init static void enable_instances(void)
if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
do_allocate_snapshot(tok);
- tr = trace_array_get_by_name(tok);
+ tr = trace_array_get_by_name(tok, NULL);
if (!tr) {
pr_warn("Failed to create instance buffer %s\n", curr_str);
continue;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e45756f1ac2b..db0d2641125e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9d22745cdea5..5f74e9f9c8a7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
@@ -790,7 +797,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
+
+ WARN_ON_ONCE(ret);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
@@ -3056,6 +3065,41 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
up_write(&trace_event_sem);
}
+static bool event_in_systems(struct trace_event_call *call,
+ const char *systems)
+{
+ const char *system;
+ const char *p;
+
+ if (!systems)
+ return true;
+
+ system = call->class->system;
+ p = strstr(systems, system);
+ if (!p)
+ return false;
+
+ if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+ return false;
+
+ p += strlen(system);
+ return !*p || isspace(*p) || *p == ',';
+}
+
+#ifdef CONFIG_HIST_TRIGGERS
+/*
+ * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger
+ * may happen in any context.
+ */
+static void hist_poll_event_irq_work(struct irq_work *work)
+{
+ wake_up_all(&hist_poll_wq);
+}
+
+DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work);
+DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq);
+#endif
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -3065,9 +3109,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file;
unsigned int first;
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
+
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
@@ -3132,8 +3179,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
if (eventdir_initialized)
return event_create_dir(tr->event_dir, file);
@@ -3172,8 +3228,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ret = event_define_fields(call);
if (ret)
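
event_in_systems() accepts a comma- or space-separated list and requires the match to sit on token boundaries, so system "sched" matches in "sched,irq" but not in "sched_ext"; note that only the first strstr() occurrence is boundary-checked. A standalone test of the same logic (userspace harness, hypothetical):

	#include <ctype.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	static bool event_in_systems(const char *system, const char *systems)
	{
		const char *p;

		if (!systems)
			return true;		/* no filter: all systems allowed */

		p = strstr(systems, system);
		if (!p)
			return false;
		/* the match must start on a list boundary ... */
		if (p != systems && !isspace((unsigned char)p[-1]) && p[-1] != ',')
			return false;
		/* ... and end on one */
		p += strlen(system);
		return !*p || isspace((unsigned char)*p) || *p == ',';
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       event_in_systems("sched", "sched,irq"),	/* 1 */
		       event_in_systems("irq", "sched,irq"),	/* 1 */
		       event_in_systems("sched", "sched_ext"));	/* 0 */
		return 0;
	}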
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c611b281a5b..f50c2ad43f3d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 604d63380a90..e6f9cbc622c7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5322,6 +5322,8 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+
+ hist_poll_wakeup();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5601,49 +5603,137 @@ static void hist_trigger_show(struct seq_file *m,
n_entries, (u64)atomic64_read(&hist_data->map->drops));
}
+struct hist_file_data {
+ struct file *file;
+ u64 last_read;
+ u64 last_act;
+};
+
+static u64 get_hist_hit_count(struct trace_event_file *event_file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *data;
+ u64 ret = 0;
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = data->private_data;
+ ret += atomic64_read(&hist_data->map->hits);
+ }
+ }
+ return ret;
+}
+
static int hist_show(struct seq_file *m, void *v)
{
+ struct hist_file_data *hist_file = m->private;
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(hist_file->file);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
+ hist_file->last_read = get_hist_hit_count(event_file);
+ /*
+ * Update last_act too so that poll()/POLLPRI can wait for the next
+ * event after any syscall on the hist file.
+ */
+ hist_file->last_act = hist_file->last_read;
- out_unlock:
- mutex_unlock(&event_mutex);
+ return 0;
+}
+
+static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct trace_event_file *event_file;
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+ __poll_t ret = 0;
+ u64 cnt;
+
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file)
+ return EPOLLERR;
+
+ hist_poll_wait(file, wait);
+
+ cnt = get_hist_hit_count(event_file);
+ if (hist_file->last_read != cnt)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (hist_file->last_act != cnt) {
+ hist_file->last_act = cnt;
+ ret |= EPOLLPRI;
+ }
return ret;
}
+static int event_hist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+
+ kfree(hist_file);
+ return tracing_single_release_file_tr(inode, file);
+}
+
static int event_hist_open(struct inode *inode, struct file *file)
{
+ struct trace_event_file *event_file;
+ struct hist_file_data *hist_file;
int ret;
ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL);
+ if (!hist_file) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ hist_file->file = file;
+ hist_file->last_act = get_hist_hit_count(event_file);
+
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_show, file);
+ ret = single_open(file, hist_show, hist_file);
+ if (ret) {
+ kfree(hist_file);
+ goto err;
+ }
+
+ return 0;
+err:
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_single_release_file_tr,
+ .release = event_hist_release,
+ .poll = event_hist_poll,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
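The new .poll handler lets user space block on a "hist" file: EPOLLIN is
reported while there are hits that have not yet been read back, and EPOLLPRI
on any new activity since the last poll. A hedged user-space sketch (the
event path is illustrative; any event with a hist trigger attached works):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>

	int main(void)
	{
		int fd = open("/sys/kernel/tracing/events/sched/sched_switch/hist",
			      O_RDONLY);
		struct pollfd pfd;

		if (fd < 0)
			return 1;
		pfd.fd = fd;
		pfd.events = POLLIN | POLLPRI;
		/* blocks until the histogram accumulates new hits */
		if (poll(&pfd, 1, -1) > 0)
			printf("hist updated, revents=%#x\n", pfd.revents);
		return 0;
	}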
@@ -5884,25 +5974,19 @@ static int hist_debug_show(struct seq_file *m, void *v)
{
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_debug_show(m, data, n++);
}
-
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
static int event_hist_debug_open(struct inode *inode, struct file *file)
@@ -5915,7 +5999,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_debug_show, file);
+ ret = single_open(file, hist_debug_show, file);
+ if (ret)
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_debug_fops = {
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 624e0867316d..794e72dbb12d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -312,7 +312,7 @@ static const char *synth_field_fmt(char *type)
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
- fmt = "%.*s";
+ fmt = "%s";
else if (synth_field_is_stack(type))
fmt = "%s";
@@ -377,7 +377,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
@@ -859,6 +858,38 @@ static struct trace_event_fields synth_event_fields_array[] = {
{}
};
+static int synth_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct synth_event *event = container_of(call, struct synth_event, call);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+#endif
+ case TRACE_REG_REGISTER:
+ if (!try_module_get(event->mod))
+ return -EBUSY;
+ break;
+ default:
+ break;
+ }
+
+ int ret = trace_event_reg(call, type, data);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_UNREGISTER:
+#endif
+ case TRACE_REG_UNREGISTER:
+ module_put(event->mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -888,7 +919,7 @@ static int register_synth_event(struct synth_event *event)
goto out;
}
call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
+ call->class->reg = synth_event_reg;
call->class->probe = trace_event_raw_event_synth;
call->data = event;
call->tp = event->tp;
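synth_event_reg() pins the module that created the synthetic event for as
long as the event is registered, so the module cannot be unloaded while its
tracepoint may still fire. The underlying pattern, sketched with a
hypothetical helper (not from this patch):

	static int example_pin(struct module *mod)
	{
		if (!try_module_get(mod))	/* fails once mod starts unloading */
			return -EBUSY;
		/* ... mod is guaranteed to stay loaded in here ... */
		module_put(mod);		/* drop the reference when done */
		return 0;
	}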
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c35fbaab2a47..4d4808186a0f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1317,6 +1317,7 @@ void graph_trace_close(struct trace_iterator *iter)
if (data) {
free_percpu(data->cpu_data);
kfree(data);
+ iter->private = NULL;
}
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ba37f768e2f2..6c9db857fe0e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -231,8 +231,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index cc155590018f..5bd781359d38 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2038,7 +2038,6 @@ static int start_kthread(unsigned int cpu)
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- stop_per_cpu_kthreads();
return -ENOMEM;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 2b948d35fb59..448ee37ae245 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -950,11 +950,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -964,7 +965,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
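With the down_read()/up_read() pair removed, print_event_fields() now
requires its caller to hold trace_event_sem for reading, which the new
lockdep_assert_held_read() enforces. A sketch of the caller-side contract
this implies (assuming the usual rwsem usage):

	enum print_line_t ret;

	down_read(&trace_event_sem);
	ret = print_event_fields(iter, event);
	up_read(&trace_event_sem);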
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8c73156a7eb9..606190239c87 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -769,6 +769,10 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+/*
+ * Add the entry code to store the 'argnum'th parameter and return the offset
+ * in the entry data buffer where the data will be stored.
+ */
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
@@ -792,6 +796,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
tp->entry_arg = earg;
}
+ /*
+ * The entry code array repeats pairs of
+ * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset in the entry data buffer)]
+ * and the remaining entries are filled with [FETCH_OP_END].
+ *
+ * To avoid fetching the same function parameter twice, scan the entry
+ * code array for a FETCH_OP_ARG that already fetches the 'argnum'
+ * parameter; each non-matching pair advances 'offset' so that it always
+ * tracks the last used offset.
+ * If FETCH_OP_END is reached without a matching FETCH_OP_ARG entry,
+ * append a new FETCH_OP_ARG/FETCH_OP_ST_EDATA pair there, and return
+ * the data offset so the caller knows where the value lands in the
+ * entry data buffer. (An illustration of the layout follows this hunk.)
+ */
offset = 0;
for (i = 0; i < earg->size - 1; i++) {
switch (earg->code[i].op) {
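An illustration of the layout the comment above describes, for a probe that
stores two entry arguments on a 64-bit kernel (the argument-to-param mapping
shown is hypothetical):

	/*
	 * code[0] = FETCH_OP_ARG      (param  = 0)
	 * code[1] = FETCH_OP_ST_EDATA (offset = 0)
	 * code[2] = FETCH_OP_ARG      (param  = 2)
	 * code[3] = FETCH_OP_ST_EDATA (offset = 8)
	 * code[4] = FETCH_OP_END
	 */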
@@ -825,6 +843,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp)
if (!earg)
return 0;
+ /*
+ * The earg->code[] array holds the operation sequence that is run in
+ * the entry handler.
+ * The sequence is terminated by FETCH_OP_END, and each value is stored
+ * into the entry data buffer by FETCH_OP_ST_EDATA, which writes it at
+ * the data buffer plus its offset. All values are "unsigned long"
+ * sized, and the offset increases with each value stored. Thus the
+ * total size is given by the last FETCH_OP_ST_EDATA in the code array.
+ */
for (i = 0; i < earg->size; i++) {
switch (earg->code[i].op) {
case FETCH_OP_END:
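For the layout illustrated earlier, the size computed here would be the
offset of the last FETCH_OP_ST_EDATA plus one stored value, i.e.:

	/* offset 8 + sizeof(unsigned long) = 16 bytes on a 64-bit kernel */
	int size = earg->code[3].offset + sizeof(unsigned long);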
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0469a04a355f..330aee1c1a49 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -168,8 +168,6 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 778b4056700f..17254597accd 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -269,6 +269,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
if (ret < 0)
goto error;
+ /*
+ * pipe_resize_ring() does not update nr_accounted for watch_queue
+ * pipes, because the above vastly overprovisions. Set nr_accounted and
+ * max_usage on this pipe to the number that was actually charged to
+ * the user above via account_pipe_buffers().
+ */
+ pipe->max_usage = nr_pages;
+ pipe->nr_accounted = nr_pages;
+
ret = -ENOMEM;
pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
if (!pages)