Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/acct.c | 4
-rw-r--r--  kernel/audit.c | 160
-rw-r--r--  kernel/auditsc.c | 29
-rw-r--r--  kernel/bpf/Makefile | 12
-rw-r--r--  kernel/bpf/arena.c | 405
-rw-r--r--  kernel/bpf/arraymap.c | 29
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c | 62
-rw-r--r--  kernel/bpf/bpf_inode_storage.c | 6
-rw-r--r--  kernel/bpf/bpf_insn_array.c | 4
-rw-r--r--  kernel/bpf/bpf_iter.c | 2
-rw-r--r--  kernel/bpf/bpf_local_storage.c | 408
-rw-r--r--  kernel/bpf/bpf_lsm.c | 5
-rw-r--r--  kernel/bpf/bpf_lsm_proto.c | 19
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 88
-rw-r--r--  kernel/bpf/bpf_task_storage.c | 154
-rw-r--r--  kernel/bpf/btf.c | 228
-rw-r--r--  kernel/bpf/cgroup.c | 6
-rw-r--r--  kernel/bpf/cgroup_iter.c | 26
-rw-r--r--  kernel/bpf/core.c | 20
-rw-r--r--  kernel/bpf/cpumap.c | 21
-rw-r--r--  kernel/bpf/cpumask.c | 2
-rw-r--r--  kernel/bpf/crypto.c | 10
-rw-r--r--  kernel/bpf/hashtab.c | 105
-rw-r--r--  kernel/bpf/helpers.c | 698
-rw-r--r--  kernel/bpf/inode.c | 42
-rw-r--r--  kernel/bpf/local_storage.c | 27
-rw-r--r--  kernel/bpf/map_iter.c | 2
-rw-r--r--  kernel/bpf/offload.c | 12
-rw-r--r--  kernel/bpf/range_tree.c | 5
-rw-r--r--  kernel/bpf/ringbuf.c | 1
-rw-r--r--  kernel/bpf/rqspinlock.c | 8
-rw-r--r--  kernel/bpf/stream.c | 24
-rw-r--r--  kernel/bpf/syscall.c | 172
-rw-r--r--  kernel/bpf/tnum.c | 16
-rw-r--r--  kernel/bpf/token.c | 1
-rw-r--r--  kernel/bpf/trampoline.c | 320
-rw-r--r--  kernel/bpf/verifier.c | 1471
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 8
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 12
-rw-r--r--  kernel/cgroup/cgroup.c | 50
-rw-r--r--  kernel/cgroup/cpuset-internal.h | 54
-rw-r--r--  kernel/cgroup/cpuset-v1.c | 271
-rw-r--r--  kernel/cgroup/cpuset.c | 559
-rw-r--r--  kernel/cgroup/debug.c | 2
-rw-r--r--  kernel/cgroup/dmem.c | 70
-rw-r--r--  kernel/configs/debug.config | 2
-rw-r--r--  kernel/context_tracking.c | 20
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/crash_core.c | 17
-rw-r--r--  kernel/crash_dump_dm_crypt.c | 21
-rw-r--r--  kernel/cred.c | 23
-rw-r--r--  kernel/debug/gdbstub.c | 1
-rw-r--r--  kernel/delayacct.c | 33
-rw-r--r--  kernel/dma/contiguous.c | 26
-rw-r--r--  kernel/dma/mapping.c | 4
-rw-r--r--  kernel/dma/pool.c | 34
-rw-r--r--  kernel/entry/common.c | 27
-rw-r--r--  kernel/entry/common.h | 7
-rw-r--r--  kernel/entry/syscall-common.c | 97
-rw-r--r--  kernel/entry/syscall_user_dispatch.c | 4
-rw-r--r--  kernel/events/callchain.c | 2
-rw-r--r--  kernel/events/core.c | 559
-rw-r--r--  kernel/events/uprobes.c | 24
-rw-r--r--  kernel/fork.c | 13
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/Makefile | 4
-rw-r--r--  kernel/irq/chip.c | 46
-rw-r--r--  kernel/irq/cpuhotplug.c | 6
-rw-r--r--  kernel/irq/handle.c | 2
-rw-r--r--  kernel/irq/internals.h | 112
-rw-r--r--  kernel/irq/irqdesc.c | 95
-rw-r--r--  kernel/irq/irqdomain.c | 17
-rw-r--r--  kernel/irq/manage.c | 96
-rw-r--r--  kernel/irq/proc.c | 3
-rw-r--r--  kernel/irq/timings.c | 959
-rw-r--r--  kernel/kallsyms.c | 79
-rw-r--r--  kernel/kallsyms_internal.h | 1
-rw-r--r--  kernel/kcov.c | 36
-rw-r--r--  kernel/kcsan/Makefile | 2
-rw-r--r--  kernel/kcsan/kcsan_test.c | 4
-rw-r--r--  kernel/kcsan/report.c | 11
-rw-r--r--  kernel/kexec_file.c | 131
-rw-r--r--  kernel/kthread.c | 160
-rw-r--r--  kernel/livepatch/core.c | 19
-rw-r--r--  kernel/liveupdate/Kconfig | 17
-rw-r--r--  kernel/liveupdate/Makefile | 1
-rw-r--r--  kernel/liveupdate/kexec_handover.c | 159
-rw-r--r--  kernel/liveupdate/luo_core.c | 10
-rw-r--r--  kernel/liveupdate/luo_file.c | 41
-rw-r--r--  kernel/liveupdate/luo_flb.c | 654
-rw-r--r--  kernel/liveupdate/luo_internal.h | 22
-rw-r--r--  kernel/locking/test-ww_mutex.c | 165
-rw-r--r--  kernel/module/Kconfig | 5
-rw-r--r--  kernel/module/decompress.c | 10
-rw-r--r--  kernel/module/dups.c | 4
-rw-r--r--  kernel/module/kallsyms.c | 9
-rw-r--r--  kernel/padata.c | 22
-rw-r--r--  kernel/panic.c | 168
-rw-r--r--  kernel/params.c | 15
-rw-r--r--  kernel/pid.c | 131
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/swap.c | 18
-rw-r--r--  kernel/printk/internal.h | 8
-rw-r--r--  kernel/printk/nbcon.c | 23
-rw-r--r--  kernel/printk/printk.c | 56
-rw-r--r--  kernel/printk/printk_ringbuffer.h | 5
-rw-r--r--  kernel/rcu/Kconfig | 43
-rw-r--r--  kernel/rcu/rcu.h | 9
-rw-r--r--  kernel/rcu/rcuscale.c | 7
-rw-r--r--  kernel/rcu/rcutorture.c | 10
-rw-r--r--  kernel/rcu/srcutree.c | 2
-rw-r--r--  kernel/rcu/tasks.h | 708
-rw-r--r--  kernel/rcu/tree.c | 14
-rw-r--r--  kernel/rcu/tree.h | 5
-rw-r--r--  kernel/rcu/tree_exp.h | 7
-rw-r--r--  kernel/rcu/tree_nocb.h | 80
-rw-r--r--  kernel/rcu/tree_plugin.h | 15
-rw-r--r--  kernel/resource.c | 73
-rw-r--r--  kernel/rseq.c | 365
-rw-r--r--  kernel/sched/Makefile | 3
-rw-r--r--  kernel/sched/clock.c | 3
-rw-r--r--  kernel/sched/core.c | 363
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 2
-rw-r--r--  kernel/sched/cputime.c | 22
-rw-r--r--  kernel/sched/deadline.c | 117
-rw-r--r--  kernel/sched/debug.c | 187
-rw-r--r--  kernel/sched/ext.c | 102
-rw-r--r--  kernel/sched/fair.c | 434
-rw-r--r--  kernel/sched/features.h | 2
-rw-r--r--  kernel/sched/idle.c | 13
-rw-r--r--  kernel/sched/isolation.c | 145
-rw-r--r--  kernel/sched/rt.c | 14
-rw-r--r--  kernel/sched/sched.h | 325
-rw-r--r--  kernel/sched/stats.h | 8
-rw-r--r--  kernel/sched/stop_task.c | 3
-rw-r--r--  kernel/sched/topology.c | 5
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 18
-rw-r--r--  kernel/time/posix-timers.c | 13
-rw-r--r--  kernel/time/sched_clock.c | 2
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 11
-rw-r--r--  kernel/time/time_test.c | 4
-rw-r--r--  kernel/time/timecounter.c | 35
-rw-r--r--  kernel/time/timekeeping.c | 2
-rw-r--r--  kernel/time/timer.c | 2
-rw-r--r--  kernel/time/timer_migration.c | 25
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/blktrace.c | 2
-rw-r--r--  kernel/trace/bpf_trace.c | 84
-rw-r--r--  kernel/trace/ftrace.c | 412
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.c | 30
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.h | 22
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.c | 40
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.h | 24
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.c | 2
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.c | 2
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.c | 26
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.h | 14
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.c | 28
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.h | 14
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.c | 28
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.h | 18
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.c | 26
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.h | 14
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.c | 38
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.h | 22
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.c | 34
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.h | 28
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 26
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.h | 14
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 28
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.h | 14
-rw-r--r--  kernel/trace/trace.c | 15
-rw-r--r--  kernel/trace/trace.h | 9
-rw-r--r--  kernel/trace/trace_entries.h | 32
-rw-r--r--  kernel/trace/trace_events_hist.c | 9
-rw-r--r--  kernel/trace/trace_events_synth.c | 8
-rw-r--r--  kernel/trace/trace_export.c | 21
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/ucount.c | 2
-rw-r--r--  kernel/unwind/user.c | 12
-rw-r--r--  kernel/vmcore_info.c | 13
-rw-r--r--  kernel/watchdog.c | 12
-rw-r--r--  kernel/watchdog_perf.c | 50
-rw-r--r--  kernel/workqueue.c | 166
192 files changed, 8916 insertions, 5569 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index da326800c1c9..88c594c6d7fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY
choice
prompt "Preemption Model"
+ default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
depends on !PREEMPT_RT
+ depends on ARCH_NO_PREEMPT
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
@@ -35,6 +37,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
+ depends on !ARCH_HAS_PREEMPT_LAZY
depends on !ARCH_NO_PREEMPT
depends on !PREEMPT_RT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
diff --git a/kernel/Makefile b/kernel/Makefile
index e83669841b8c..6785982013dc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,8 @@ KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
KMSAN_SANITIZE_kcov.o := n
+
+CONTEXT_ANALYSIS_kcov.o := y
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
obj-y += sched/
diff --git a/kernel/acct.c b/kernel/acct.c
index 2a2b3c874acd..812808e5b1b8 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -218,7 +218,6 @@ static int acct_on(const char __user *name)
/* Difference from BSD - they don't do O_APPEND */
const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE;
struct pid_namespace *ns = task_active_pid_ns(current);
- struct filename *pathname __free(putname) = getname(name);
struct file *original_file __free(fput) = NULL; // in that order
struct path internal __free(path_put) = {}; // in that order
struct file *file __free(fput_sync) = NULL; // in that order
@@ -226,8 +225,7 @@ static int acct_on(const char __user *name)
struct vfsmount *mnt;
struct fs_pin *old;
- if (IS_ERR(pathname))
- return PTR_ERR(pathname);
+ CLASS(filename, pathname)(name);
original_file = file_open_name(pathname, open_flags, 0);
if (IS_ERR(original_file))
return PTR_ERR(original_file);
diff --git a/kernel/audit.c b/kernel/audit.c
index 26a332ffb1b8..592d927e70f9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -32,6 +32,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/file.h>
+#include <linux/hex.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
@@ -58,6 +59,9 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/sctp.h>
#include "audit.h"
@@ -2488,6 +2492,162 @@ void audit_log_path_denied(int type, const char *operation)
audit_log_end(ab);
}
+int audit_log_nf_skb(struct audit_buffer *ab,
+ const struct sk_buff *skb, u8 nfproto)
+{
+ /* find the IP protocol in the case of NFPROTO_BRIDGE */
+ if (nfproto == NFPROTO_BRIDGE) {
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ nfproto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ nfproto = NFPROTO_IPV6;
+ break;
+ default:
+ goto unknown_proto;
+ }
+ }
+
+ switch (nfproto) {
+ case NFPROTO_IPV4: {
+ struct iphdr iph;
+ const struct iphdr *ih;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb),
+ sizeof(iph), &iph);
+ if (!ih)
+ return -ENOMEM;
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcph), &_tcph);
+ if (!th)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, ih->protocol,
+ ntohs(th->source), ntohs(th->dest));
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udph), &_udph);
+ if (!uh)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, ih->protocol,
+ ntohs(uh->source), ntohs(uh->dest));
+ break;
+ }
+ case IPPROTO_SCTP: {
+ struct sctphdr _sctph;
+ const struct sctphdr *sh;
+
+ sh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_sctph), &_sctph);
+ if (!sh)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, ih->protocol,
+ ntohs(sh->source), ntohs(sh->dest));
+ break;
+ }
+ default:
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+ &ih->saddr, &ih->daddr, ih->protocol);
+ }
+
+ break;
+ }
+ case NFPROTO_IPV6: {
+ struct ipv6hdr iph;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ __be16 frag_off;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb),
+ sizeof(iph), &iph);
+ if (!ih)
+ return -ENOMEM;
+
+ nexthdr = ih->nexthdr;
+ ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(iph),
+ &nexthdr, &frag_off);
+
+ switch (nexthdr) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcph), &_tcph);
+ if (!th)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, nexthdr,
+ ntohs(th->source), ntohs(th->dest));
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udph), &_udph);
+ if (!uh)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, nexthdr,
+ ntohs(uh->source), ntohs(uh->dest));
+ break;
+ }
+ case IPPROTO_SCTP: {
+ struct sctphdr _sctph;
+ const struct sctphdr *sh;
+
+ sh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_sctph), &_sctph);
+ if (!sh)
+ return -ENOMEM;
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu",
+ &ih->saddr, &ih->daddr, nexthdr,
+ ntohs(sh->source), ntohs(sh->dest));
+ break;
+ }
+ default:
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+ }
+
+ break;
+ }
+ default:
+ goto unknown_proto;
+ }
+
+ return 0;
+
+unknown_proto:
+ audit_log_format(ab, " saddr=? daddr=? proto=?");
+ return -EPFNOSUPPORT;
+}
+EXPORT_SYMBOL(audit_log_nf_skb);
+
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dd0563a8e0be..86a44b162a87 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2170,29 +2170,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
}
/**
- * __audit_reusename - fill out filename with info from existing entry
- * @uptr: userland ptr to pathname
- *
- * Search the audit_names list for the current audit context. If there is an
- * existing entry with a matching "uptr" then return the filename
- * associated with that audit_name. If not, return NULL.
- */
-struct filename *
-__audit_reusename(const __user char *uptr)
-{
- struct audit_context *context = audit_context();
- struct audit_names *n;
-
- list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
- continue;
- if (n->name->uptr == uptr)
- return refname(n->name);
- }
- return NULL;
-}
-
-/**
* __audit_getname - add a name to the list
* @name: name to add
*
@@ -2214,7 +2191,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- refname(name);
+ name->refcnt++;
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2346,7 +2323,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- refname(name);
+ name->refcnt++;
}
out:
@@ -2468,7 +2445,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- refname(found_child->name);
+ found_child->name->refcnt++;
}
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 232cbc97434d..79cf22860a99 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -42,7 +42,17 @@ endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
-obj-${CONFIG_BPF_LSM} += bpf_lsm.o
+# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic
+# deduplicates function prototypes within
+# btf_encoder__add_saved_func() by keeping the first instance seen. We
+# need the function prototype(s) in bpf_lsm_proto.o to take precedence
+# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede
+# bpf_lsm.o ensures its DWARF CU is processed early, forcing the
+# generated BTF to contain the overrides.
+#
+# Notably, this is a temporary workaround whilst the deduplication
+# semantics within pahole are revisited accordingly.
+obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o
endif
ifneq ($(CONFIG_CRYPTO),)
obj-$(CONFIG_BPF_SYSCALL) += crypto.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 872dc0e41c65..42fae0a9f314 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -2,11 +2,15 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/cacheflush.h>
#include <linux/err.h>
+#include <linux/irq_work.h>
#include "linux/filter.h"
+#include <linux/llist.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include <asm/tlbflush.h>
#include "range_tree.h"
/*
@@ -42,14 +46,31 @@
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);
+
struct bpf_arena {
struct bpf_map map;
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
struct range_tree rt;
+ /* protects rt */
+ rqspinlock_t spinlock;
struct list_head vma_list;
+ /* protects vma_list */
struct mutex lock;
+ struct irq_work free_irq;
+ struct work_struct free_work;
+ struct llist_head free_spans;
+};
+
+static void arena_free_worker(struct work_struct *work);
+static void arena_free_irq(struct irq_work *iw);
+
+struct arena_free_span {
+ struct llist_node node;
+ unsigned long uaddr;
+ u32 page_cnt;
};
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
@@ -92,6 +113,66 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
+struct apply_range_data {
+ struct page **pages;
+ int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct apply_range_data *d = data;
+ struct page *page;
+
+ if (!data)
+ return 0;
+ /* sanity check */
+ if (unlikely(!pte_none(ptep_get(pte))))
+ return -EBUSY;
+
+ page = d->pages[d->i];
+ /* paranoia, similar to vmap_pages_pte_range() */
+ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ d->i++;
+ return 0;
+}
+
+static void flush_vmap_cache(unsigned long start, unsigned long size)
+{
+ flush_cache_vmap(start, start + size);
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+{
+ pte_t old_pte;
+ struct page *page;
+
+ /* sanity check */
+ old_pte = ptep_get(pte);
+ if (pte_none(old_pte) || !pte_present(old_pte))
+ return 0; /* nothing to do */
+
+ page = pte_page(old_pte);
+ if (WARN_ON_ONCE(!page))
+ return -EINVAL;
+
+ pte_clear(&init_mm, addr, pte);
+
+ /* Add page to the list so it is freed later */
+ if (free_pages)
+ __llist_add(&page->pcp_llist, free_pages);
+
+ return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
struct vm_struct *kern_vm;
@@ -136,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
arena->user_vm_end = arena->user_vm_start + vm_range;
INIT_LIST_HEAD(&arena->vma_list);
+ init_llist_head(&arena->free_spans);
+ init_irq_work(&arena->free_irq, arena_free_irq);
+ INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
@@ -144,6 +228,13 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
goto err;
}
mutex_init(&arena->lock);
+ raw_res_spin_lock_init(&arena->spinlock);
+ err = populate_pgtable_except_pte(arena);
+ if (err) {
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
+ goto err;
+ }
return &arena->map;
err:
@@ -184,6 +275,10 @@ static void arena_map_free(struct bpf_map *map)
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
return;
+ /* Ensure no pending deferred frees */
+ irq_work_sync(&arena->free_irq);
+ flush_work(&arena->free_work);
+
/*
* free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
* It unmaps everything from vmalloc area and clears pgtables.
@@ -265,44 +360,59 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
struct bpf_map *map = vmf->vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct mem_cgroup *new_memcg, *old_memcg;
struct page *page;
long kbase, kaddr;
+ unsigned long flags;
int ret;
kbase = bpf_arena_get_kern_vm_start(arena);
kaddr = kbase + (u32)(vmf->address);
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ /* Make a reasonable effort to address impossible case */
+ return VM_FAULT_RETRY;
+
page = vmalloc_to_page((void *)kaddr);
if (page)
/* already have a page vmap-ed */
goto out;
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
+ struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
}
- ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- __free_page(page);
- return VM_FAULT_SIGSEGV;
+ free_pages_nolock(page, 0);
+ goto out_unlock_sigsegv;
}
+ flush_vmap_cache(kaddr, PAGE_SIZE);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
out:
page_ref_add(page, 1);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
+out_unlock_sigsegv:
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ return VM_FAULT_SIGSEGV;
}
static const struct vm_operations_struct arena_vm_ops = {
@@ -423,12 +533,18 @@ static u64 clear_lo32(u64 val)
* Allocate pages and vmap them into kernel vmalloc area.
* Later the pages will be mmaped into user space vma.
*/
-static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
+ bool sleepable)
{
/* user_vm_end/start are fixed before bpf prog runs */
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
- struct page **pages;
+ struct mem_cgroup *new_memcg, *old_memcg;
+ struct apply_range_data data;
+ struct page **pages = NULL;
+ long remaining, mapped = 0;
+ long alloc_pages;
+ unsigned long flags;
long pgoff = 0;
u32 uaddr32;
int ret, i;
@@ -445,17 +561,23 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
- /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
- pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+ /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
+ alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
+ pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
+ if (!pages) {
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return 0;
+ }
+ data.pages = pages;
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ goto out_free_pages;
if (uaddr) {
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
if (ret)
- goto out_free_pages;
+ goto out_unlock_free_pages;
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
} else {
ret = pgoff = range_tree_find(&arena->rt, page_cnt);
@@ -463,33 +585,62 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
}
if (ret)
- goto out_free_pages;
-
- ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
- if (ret)
- goto out;
+ goto out_unlock_free_pages;
+ remaining = page_cnt;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
- /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
- * will not overflow 32-bit. Lower 32-bit need to represent
- * contiguous user address range.
- * Map these pages at kern_vm_start base.
- * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
- * lower 32-bit and it's ok.
- */
- ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
- kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
- if (ret) {
- for (i = 0; i < page_cnt; i++)
- __free_page(pages[i]);
- goto out;
+
+ while (remaining) {
+ long this_batch = min(remaining, alloc_pages);
+
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+ memset(pages, 0, this_batch * sizeof(struct page *));
+
+ ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
+ if (ret)
+ goto out;
+
+ /*
+ * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
+ data.i = 0;
+ ret = apply_to_page_range(&init_mm,
+ kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
+ this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
+ if (ret) {
+ /* data.i pages were mapped, account them and free the remaining */
+ mapped += data.i;
+ for (i = data.i; i < this_batch; i++)
+ free_pages_nolock(pages[i], 0);
+ goto out;
+ }
+
+ mapped += this_batch;
+ remaining -= this_batch;
}
- kvfree(pages);
+ flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ kfree_nolock(pages);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
- range_tree_set(&arena->rt, pgoff, page_cnt);
+ range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ if (mapped) {
+ flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+ arena_free_pages(arena, uaddr32, mapped, sleepable);
+ }
+ goto out_free_pages;
+out_unlock_free_pages:
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
out_free_pages:
- kvfree(pages);
+ kfree_nolock(pages);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return 0;
}
@@ -502,42 +653,66 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
struct vma_list *vml;
+ guard(mutex)(&arena->lock);
+ /* iterate link list under lock */
list_for_each_entry(vml, &arena->vma_list, head)
zap_page_range_single(vml->vma, uaddr,
PAGE_SIZE * page_cnt, NULL);
}
-static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
{
+ struct mem_cgroup *new_memcg, *old_memcg;
u64 full_uaddr, uaddr_end;
- long kaddr, pgoff, i;
+ long kaddr, pgoff;
struct page *page;
+ struct llist_head free_pages;
+ struct llist_node *pos, *t;
+ struct arena_free_span *s;
+ unsigned long flags;
+ int ret = 0;
/* only aligned lower 32-bit are relevant */
uaddr = (u32)uaddr;
uaddr &= PAGE_MASK;
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
if (full_uaddr >= uaddr_end)
return;
page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+ pgoff = compute_pgoff(arena, uaddr);
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
- guard(mutex)(&arena->lock);
+ if (!sleepable)
+ goto defer;
+
+ ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
+
+ /* Can't proceed without holding the spinlock so defer the free */
+ if (ret)
+ goto defer;
- pgoff = compute_pgoff(arena, uaddr);
- /* clear range */
range_tree_set(&arena->rt, pgoff, page_cnt);
+ init_llist_head(&free_pages);
+ /* clear ptes and collect struct pages */
+ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
+ apply_range_clear_cb, &free_pages);
+
+ /* drop the lock to do the tlb flush and zap pages */
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+
+ /* ensure no stale TLB entries */
+ flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
+
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */
zap_pages(arena, full_uaddr, page_cnt);
- kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
- for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
- page = vmalloc_to_page((void *)kaddr);
- if (!page)
- continue;
+ llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
+ page = llist_entry(pos, struct page, pcp_llist);
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
/* Optimization for the common case of page_cnt==1:
* If page wasn't mapped into some user vma there
@@ -545,9 +720,27 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
* page_cnt is big it's faster to do the batched zap.
*/
zap_pages(arena, full_uaddr, 1);
- vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
}
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+
+ return;
+
+defer:
+ s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+ if (!s)
+ /*
+ * If allocation fails in non-sleepable context, pages are intentionally left
+ * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not
+ * possible here, so we intentionally omit them for safety.
+ */
+ return;
+
+ s->page_cnt = page_cnt;
+ s->uaddr = uaddr;
+ llist_add(&s->node, &arena->free_spans);
+ irq_work_queue(&arena->free_irq);
}
/*
@@ -557,6 +750,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ struct mem_cgroup *new_memcg, *old_memcg;
+ unsigned long flags;
long pgoff;
int ret;
@@ -567,15 +762,94 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
if (pgoff + page_cnt > page_cnt_max)
return -EINVAL;
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ return -EBUSY;
/* Cannot guard already allocated pages. */
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
- if (ret)
- return -EBUSY;
+ if (ret) {
+ ret = -EBUSY;
+ goto out;
+ }
/* "Allocate" the region to prevent it from being allocated. */
- return range_tree_clear(&arena->rt, pgoff, page_cnt);
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+out:
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ return ret;
+}
+
+static void arena_free_worker(struct work_struct *work)
+{
+ struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
+ struct mem_cgroup *new_memcg, *old_memcg;
+ struct llist_node *list, *pos, *t;
+ struct arena_free_span *s;
+ u64 arena_vm_start, user_vm_start;
+ struct llist_head free_pages;
+ struct page *page;
+ unsigned long full_uaddr;
+ long kaddr, page_cnt, pgoff;
+ unsigned long flags;
+
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) {
+ schedule_work(work);
+ return;
+ }
+
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+
+ init_llist_head(&free_pages);
+ arena_vm_start = bpf_arena_get_kern_vm_start(arena);
+ user_vm_start = bpf_arena_get_user_vm_start(arena);
+
+ list = llist_del_all(&arena->free_spans);
+ llist_for_each(pos, list) {
+ s = llist_entry(pos, struct arena_free_span, node);
+ page_cnt = s->page_cnt;
+ kaddr = arena_vm_start + s->uaddr;
+ pgoff = compute_pgoff(arena, s->uaddr);
+
+ /* clear ptes and collect pages in free_pages llist */
+ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
+ apply_range_clear_cb, &free_pages);
+
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+ }
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+
+ /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */
+ llist_for_each_safe(pos, t, list) {
+ s = llist_entry(pos, struct arena_free_span, node);
+ page_cnt = s->page_cnt;
+ full_uaddr = clear_lo32(user_vm_start) + s->uaddr;
+ kaddr = arena_vm_start + s->uaddr;
+
+ /* ensure no stale TLB entries */
+ flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
+
+ /* remove pages from user vmas */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kfree_nolock(s);
+ }
+
+ /* free all pages collected by apply_to_existing_page_range() in the first loop */
+ llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
+ page = llist_entry(pos, struct page, pcp_llist);
+ __free_page(page);
+ }
+
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+}
+
+static void arena_free_irq(struct irq_work *iw)
+{
+ struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);
+
+ schedule_work(&arena->free_work);
}
__bpf_kfunc_start_defs();
@@ -589,9 +863,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
- return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
}
+void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+}
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
@@ -599,7 +884,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
return;
- arena_free_pages(arena, (long)ptr__ign, page_cnt);
+ arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
+}
+
+void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
}
__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
@@ -618,9 +913,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
-BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1eeb31c5b317..67e9e811de3a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key,
return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
}
-int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value);
+ goto unlock;
+ }
for_each_possible_cpu(cpu) {
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
+unlock:
rcu_read_unlock();
return 0;
}
@@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
- int cpu, off = 0;
+ void *ptr, *val;
u32 size;
+ int cpu;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))
/* unknown flags */
return -EINVAL;
@@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value(map, ptr, value);
+ bpf_obj_free_fields(array->map.record, ptr);
+ goto unlock;
+ }
for_each_possible_cpu(cpu) {
- copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
- bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
- off += size;
+ ptr = per_cpu_ptr(pptr, cpu);
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(map, ptr, val);
+ bpf_obj_free_fields(array->map.record, ptr);
}
+unlock:
rcu_read_unlock();
return 0;
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 0687a760974a..c2a2ead1f466 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -11,29 +11,6 @@
DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
-static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
-
-static void bpf_cgrp_storage_lock(void)
-{
- cant_migrate();
- this_cpu_inc(bpf_cgrp_storage_busy);
-}
-
-static void bpf_cgrp_storage_unlock(void)
-{
- this_cpu_dec(bpf_cgrp_storage_busy);
-}
-
-static bool bpf_cgrp_storage_trylock(void)
-{
- cant_migrate();
- if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
- this_cpu_dec(bpf_cgrp_storage_busy);
- return false;
- }
- return true;
-}
-
static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
{
struct cgroup *cg = owner;
@@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- rcu_read_lock_dont_migrate();
+ rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage)
goto out;
- bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
- bpf_cgrp_storage_unlock();
out:
- rcu_read_unlock_migrate();
+ rcu_read_unlock();
}
static struct bpf_local_storage_data *
@@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
if (IS_ERR(cgroup))
return ERR_CAST(cgroup);
- bpf_cgrp_storage_lock();
sdata = cgroup_storage_lookup(cgroup, map, true);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return sdata ? sdata->data : NULL;
}
@@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
- bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, map_flags, false, GFP_ATOMIC);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -118,8 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), false);
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
@@ -132,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
- bpf_cgrp_storage_lock();
err = cgroup_storage_delete(cgroup, map);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return err;
}
@@ -151,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
+ bpf_local_storage_map_free(map, &cgroup_cache);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -159,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
- bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -168,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- nobusy = bpf_cgrp_storage_trylock();
-
- sdata = cgroup_storage_lookup(cgroup, map, nobusy);
+ sdata = cgroup_storage_lookup(cgroup, map, true);
if (sdata)
- goto unlock;
+ goto out;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, false, gfp_flags);
-unlock:
- if (nobusy)
- bpf_cgrp_storage_unlock();
+out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
{
- int ret;
-
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!cgroup)
return -EINVAL;
- if (!bpf_cgrp_storage_trylock())
- return -EBUSY;
-
- ret = cgroup_storage_delete(cgroup, map);
- bpf_cgrp_storage_unlock();
- return ret;
+ return cgroup_storage_delete(cgroup, map);
}
const struct bpf_map_ops cgrp_storage_map_ops = {
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e54cce2b9175..e86734609f3d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), false);
-
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
@@ -186,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
static void inode_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &inode_cache, NULL);
+ bpf_local_storage_map_free(map, &inode_cache);
}
const struct bpf_map_ops inode_storage_map_ops = {
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index c96630cb75bf..c0286f25ca3c 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -123,10 +123,10 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
if ((off % sizeof(long)) != 0 ||
(off / sizeof(long)) >= map->max_entries)
- return -EINVAL;
+ return -EACCES;
/* from BPF's point of view, this map is a jump table */
- *imm = (unsigned long)insn_array->ips + off;
+ *imm = (unsigned long)insn_array->ips;
return 0;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index eec60b57bd3d..4b58d56ecab1 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq)
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* The following are differences from seq_read():
- * . fixed buffer size (PAGE_SIZE)
+ * . fixed buffer size (PAGE_SIZE << 3)
* . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index e2fe6c32822b..b28f07d3a0db 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -19,9 +19,9 @@
static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *selem)
+ struct bpf_local_storage *local_storage)
{
- return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+ return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
}
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
@@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
return !hlist_unhashed(&selem->snode);
}
-static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
-{
- return !hlist_unhashed_lockless(&selem->map_node);
-}
-
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -90,6 +85,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+ atomic_set(&selem->state, 0);
+ selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -198,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- migrate_disable();
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
+ if (smap) {
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
+ }
kfree_nolock(selem);
}
@@ -219,13 +218,14 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- if (!smap->use_kmalloc_nolock) {
+ if (!selem->use_kmalloc_nolock) {
/*
* No uptr will be unpin even when reuse_now == false since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@@ -256,6 +256,36 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
bpf_selem_free(selem, reuse_now);
}
+static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage *local_storage,
+ bool free_local_storage, bool pin_owner)
+{
+ void *owner = local_storage->owner;
+ u32 uncharge = smap->elem_size;
+
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
+ return;
+
+ uncharge += free_local_storage ? sizeof(*local_storage) : 0;
+ mem_uncharge(smap, local_storage->owner, uncharge);
+ local_storage->mem_charge -= uncharge;
+
+ if (free_local_storage) {
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+ }
+
+ if (pin_owner)
+ refcount_dec(&local_storage->owner_refcnt);
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@@ -266,124 +296,219 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
- void *owner;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- owner = local_storage->owner;
-
- /* All uncharging on the owner must be done first.
- * The owner may be freed once the last selem is unlinked
- * from local_storage.
- */
- mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
- if (free_local_storage) {
- mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
- local_storage->owner = NULL;
- /* After this RCU_INIT, owner may be freed and cannot be used */
- RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+ bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
+ free_local_storage, false);
- /* local_storage is not freed now. local_storage->lock is
- * still held and raw_spin_unlock_bh(&local_storage->lock)
- * will be done by the caller.
- *
- * Although the unlock will be done under
- * rcu_read_lock(), it is more intuitive to
- * read if the freeing of the storage is done
- * after the raw_spin_unlock_bh(&local_storage->lock).
- *
- * Hence, a "bool free_local_storage" is returned
- * to the caller which then calls then frees the storage after
- * all the RCU grace periods have expired.
- */
- }
hlist_del_init_rcu(&selem->snode);
- if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
- SDATA(selem))
- RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
hlist_add_head(&selem->free_node, free_selem_list);
- if (rcu_access_pointer(local_storage->smap) == smap)
- RCU_INIT_POINTER(local_storage->smap, NULL);
-
return free_local_storage;
}
-static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
- bool reuse_now)
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ local_storage->mem_charge += smap->elem_size;
+
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage *local_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+ int err;
+
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ b = select_bucket(smap, local_storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ return err;
+
+ hlist_del_init_rcu(&selem->map_node);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+
+ return 0;
+}
+
+static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
+{
+ hlist_del_init_rcu(&selem->map_node);
+}
+
+int bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+ int err;
+
+ b = select_bucket(smap, local_storage);
+
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ return err;
+
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+
+ return 0;
+}
+
+static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
+ struct bpf_local_storage_elem *selem)
+{
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+}
+
+/*
+ * Unlink an selem from map and local storage with lock held.
+ * This is the common path used by local storages to delete an selem.
+ */
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
+ int err;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
- return;
+ return 0;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- if (likely(selem_linked_to_storage(selem)))
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ return err;
+
+ if (likely(selem_linked_to_storage(selem))) {
+ /* Always unlink from map before unlinking from local_storage
+ * because selem will be freed after successfully unlinked from
+ * the local_storage.
+ */
+ err = bpf_selem_unlink_map(selem);
+ if (err)
+ goto out;
+
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, &selem_free_list);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+out:
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
- bpf_selem_free_list(&selem_free_list, reuse_now);
+ bpf_selem_free_list(&selem_free_list, false);
if (free_local_storage)
- bpf_local_storage_free(local_storage, reuse_now);
-}
+ bpf_local_storage_free(local_storage, false);
-void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem)
-{
- RCU_INIT_POINTER(selem->local_storage, local_storage);
- hlist_add_head_rcu(&selem->snode, &local_storage->list);
+ return err;
}
-static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+/*
+ * Unlink an selem from map and local storage with lockless fallback if callers
+ * are racing or rqspinlock returns error. It should only be called by
+ * bpf_local_storage_destroy() or bpf_local_storage_map_free().
+ */
+static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map_bucket *b)
{
+ bool in_map_free = !!b, free_storage = false;
+ struct bpf_local_storage *local_storage;
struct bpf_local_storage_map *smap;
- struct bpf_local_storage_map_bucket *b;
unsigned long flags;
+ int err, unlink = 0;
- if (unlikely(!selem_linked_to_map_lockless(selem)))
- /* selem has already be unlinked from smap */
- return;
-
+ local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- b = select_bucket(smap, selem);
- raw_spin_lock_irqsave(&b->lock, flags);
- if (likely(selem_linked_to_map(selem)))
- hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_irqrestore(&b->lock, flags);
-}
-void bpf_selem_link_map(struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *selem)
-{
- struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
- unsigned long flags;
+ if (smap) {
+ b = b ? : select_bucket(smap, local_storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (!err) {
+ /*
+ * Call bpf_obj_free_fields() under b->lock to make sure it is done
+ * exactly once for an selem. Safe to free special fields immediately
+ * as no BPF program should be referencing the selem.
+ */
+ if (likely(selem_linked_to_map(selem))) {
+ hlist_del_init_rcu(&selem->map_node);
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+ }
+ /*
+ * Highly unlikely scenario: resource leak
+ *
+ * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
+ * and both selem belong to the same bucket, if destroy(selem2) acquired
+ * b->lock and block for too long, neither map_free(selem1) and
+ * destroy(selem1) will be able to free the special field associated
+ * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
+ */
+ WARN_ON_ONCE(err && in_map_free);
+ if (!err || in_map_free)
+ RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
+ }
- raw_spin_lock_irqsave(&b->lock, flags);
- hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_irqrestore(&b->lock, flags);
-}
+ if (local_storage) {
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (!err) {
+ if (likely(selem_linked_to_storage(selem))) {
+ free_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ /*
+ * Okay to skip clearing owner_storage and storage->owner in
+ * destroy() since the owner is going away. No user or bpf
+ * programs should be able to reference it.
+ */
+ if (smap && in_map_free)
+ bpf_selem_unlink_storage_nolock_misc(
+ selem, smap, local_storage,
+ free_storage, true);
+ hlist_del_init_rcu(&selem->snode);
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+ if (!err || !in_map_free)
+ RCU_INIT_POINTER(selem->local_storage, NULL);
+ }
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
-{
- /* Always unlink from map before unlinking from local_storage
- * because selem will be freed after successfully unlinked from
- * the local_storage.
+ if (unlink != 2)
+ atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
+
+ /*
+ * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
+ * then freed after an RCU grace period. However, if destroy() and map_free() are
+ * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
+ * the selem only after both map_free() and destroy() see the selem.
*/
- bpf_selem_unlink_map(selem);
- bpf_selem_unlink_storage(selem, reuse_now);
+ if (unlink == 2 ||
+ atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
+ bpf_selem_free(selem, true);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, true);
}
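
A standalone sketch of the cross-caller handshake above (illustration only, not kernel code: the flag names mirror the roles of SELEM_MAP_UNLINKED/SELEM_STORAGE_UNLINKED but the values, types and thread functions are made up, and the unlink == 2 fast path is not modelled). Each racing path marks its half of the unlink; whichever path observes both halves done performs the free exactly once.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MAP_UNLINKED     0x1   /* plays the role of SELEM_MAP_UNLINKED */
#define STORAGE_UNLINKED 0x2   /* plays the role of SELEM_STORAGE_UNLINKED */
#define UNLINKED         (MAP_UNLINKED | STORAGE_UNLINKED)
#define TOFREE           0x4

struct elem {
        atomic_int state;
};

/* Set my_flag; only the caller that sees both flags set moves the element
 * to TOFREE and frees it, so the free happens exactly once.
 */
static void finish_unlink(struct elem *e, int my_flag)
{
        int old = atomic_fetch_or(&e->state, my_flag);
        int expected = UNLINKED;

        if ((old | my_flag) != UNLINKED)
                return;         /* the other path has not finished yet */

        if (atomic_compare_exchange_strong(&e->state, &expected, TOFREE))
                free(e);
}

static void *map_free_path(void *arg) { finish_unlink(arg, MAP_UNLINKED); return NULL; }
static void *destroy_path(void *arg)  { finish_unlink(arg, STORAGE_UNLINKED); return NULL; }

int main(void)
{
        struct elem *e = calloc(1, sizeof(*e));
        pthread_t a, b;

        pthread_create(&a, NULL, map_free_path, e);
        pthread_create(&b, NULL, destroy_path, e);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        puts("freed exactly once");
        return 0;
}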
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
@@ -391,16 +516,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem)
{
unsigned long flags;
+ int err;
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next bpf_local_storage_lookup().
*/
- raw_spin_lock_irqsave(&local_storage->lock, flags);
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ return;
+
if (selem_linked_to_storage(selem))
rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
@@ -424,6 +553,8 @@ int bpf_local_storage_alloc(void *owner,
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
int err;
err = mem_charge(smap, owner, sizeof(*storage));
@@ -441,14 +572,21 @@ int bpf_local_storage_alloc(void *owner,
goto uncharge;
}
- RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
- raw_spin_lock_init(&storage->lock);
+ raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
+ storage->mem_charge = sizeof(*storage);
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
+ refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
- bpf_selem_link_map(smap, first_selem);
+
+ b = select_bucket(smap, storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ goto uncharge;
+
+ bpf_selem_link_map_nolock(b, first_selem);
owner_storage_ptr =
(struct bpf_local_storage **)owner_storage(smap, owner);
@@ -464,10 +602,12 @@ int bpf_local_storage_alloc(void *owner,
*/
prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
if (unlikely(prev_storage)) {
- bpf_selem_unlink_map(first_selem);
+ bpf_selem_unlink_map_nolock(first_selem);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
err = -EAGAIN;
goto uncharge;
}
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
return 0;
@@ -489,8 +629,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ struct bpf_local_storage_map_bucket *b;
HLIST_HEAD(old_selem_free_list);
- unsigned long flags;
+ unsigned long flags, b_flags;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -549,7 +690,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
- raw_spin_lock_irqsave(&local_storage->lock, flags);
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ goto free_selem;
/* Recheck local_storage->list under local_storage->lock */
if (unlikely(hlist_empty(&local_storage->list))) {
@@ -574,22 +717,30 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
+ b = select_bucket(smap, local_storage);
+
+ err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
+ if (err)
+ goto unlock;
+
alloc_selem = NULL;
/* First, link the new selem to the map */
- bpf_selem_link_map(smap, selem);
+ bpf_selem_link_map_nolock(b, selem);
/* Second, link (and publish) the new selem to local_storage */
bpf_selem_link_storage_nolock(local_storage, selem);
/* Third, remove old selem, SELEM(old_sdata) */
if (old_sdata) {
- bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_map_nolock(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
&old_selem_free_list);
}
+ raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+free_selem:
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
@@ -657,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+/*
+ * Destroy local storage when the owner is going away. Caller must uncharge memory
+ * if memory charging is used.
+ */
+u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
- HLIST_HEAD(free_selem_list);
- struct hlist_node *n;
- unsigned long flags;
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -674,27 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- /* If local_storage list has only one element, the
- * bpf_selem_unlink_storage_nolock() will return true.
- * Otherwise, it will return false. The current loop iteration
- * intends to remove all local storage. So the last iteration
- * of the loop will set the free_cgroup_storage to true.
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ bpf_selem_unlink_nofail(selem, NULL);
+
+ if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
+ while (refcount_read(&local_storage->owner_refcnt))
+ cpu_relax();
+ /*
+ * Paired with refcount_dec() in bpf_selem_unlink_nofail()
+ * to make sure destroy() sees the correct local_storage->mem_charge.
*/
- free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, &free_selem_list);
+ smp_mb();
}
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-
- bpf_selem_free_list(&free_selem_list, true);
- if (free_storage)
- bpf_local_storage_free(local_storage, true);
+ return local_storage->mem_charge;
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -736,7 +880,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
+ raw_res_spin_lock_init(&smap->buckets[i].lock);
}
smap->elem_size = offsetof(struct bpf_local_storage_elem,
@@ -758,8 +902,7 @@ free_smap:
}
void bpf_local_storage_map_free(struct bpf_map *map,
- struct bpf_local_storage_cache *cache,
- int __percpu *busy_counter)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map_bucket *b;
struct bpf_local_storage_elem *selem;
@@ -789,15 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map,
rcu_read_lock();
/* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- if (busy_counter)
- this_cpu_inc(*busy_counter);
- bpf_selem_unlink(selem, true);
- if (busy_counter)
- this_cpu_dec(*busy_counter);
- cond_resched_rcu();
+restart:
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ bpf_selem_unlink_nofail(selem, b);
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ goto restart;
+ }
}
rcu_read_unlock();
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 7cb6e8d4282c..0c4a0c8e6f70 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -18,10 +18,11 @@
#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
- * function where a BPF program can be attached.
+ * function where a BPF program can be attached. Notably, we qualify each with
+ * weak linkage such that strong overrides can be implemented if need be.
*/
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
-noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
+__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
{ \
return DEFAULT; \
}
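
The weak-default/strong-override trick used here is plain C linkage behaviour, not anything BPF specific. A minimal standalone sketch with made-up names:

#include <stdio.h>

/* Weak default: used only when no other translation unit supplies a strong
 * definition of the same symbol.
 */
__attribute__((weak)) int policy_hook(int arg)
{
        return 0;
}

/* In another file, a strong definition would silently win at link time:
 *
 * int policy_hook(int arg)
 * {
 *         return arg < 0 ? -1 : 0;
 * }
 */

int main(void)
{
        printf("policy_hook(5) = %d\n", policy_hook(5));
        return 0;
}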
diff --git a/kernel/bpf/bpf_lsm_proto.c b/kernel/bpf/bpf_lsm_proto.c
new file mode 100644
index 000000000000..44a54fd8045e
--- /dev/null
+++ b/kernel/bpf/bpf_lsm_proto.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2025 Google LLC.
+ */
+
+#include <linux/fs.h>
+#include <linux/bpf_lsm.h>
+
+/*
+ * Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on
+ * the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This
+ * explicitly enforces that BPF LSM programs check for NULL before attempting to
+ * dereference it.
+ */
+int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot,
+ unsigned long prot, unsigned long flags)
+{
+ return 0;
+}
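
A minimal sketch of a BPF LSM program attaching to this hook; the explicit NULL check is exactly what the __nullable annotation makes the verifier demand before any dereference. It assumes a vmlinux.h generated from a kernel carrying this change and libbpf's usual tracing macros.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("lsm/mmap_file")
int BPF_PROG(check_mmap, struct file *file, unsigned long reqprot,
             unsigned long prot, unsigned long flags)
{
        /* file is PTR_MAYBE_NULL now: loading a program that dereferences
         * it without this check is rejected by the verifier.
         */
        if (!file)
                return 0;

        bpf_printk("mmap_file on inode %llu", file->f_inode->i_ino);
        return 0;
}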
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 278490683d28..c43346cb3d76 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
}
}
+static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_prog_disassoc_struct_ops(st_map->links[i]->prog);
+ }
+}
+
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
int i;
@@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
+		/* Poison the pointer on error instead of returning an error, for backward compatibility */
+ bpf_prog_assoc_struct_ops(prog, &st_map->map);
+
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
bpf_prog_put(prog);
@@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
+ bpf_struct_ops_map_dissoc_progs(st_map);
+
bpf_struct_ops_map_del_ksyms(st_map);
/* The struct_ops's function may switch to another struct_ops.
@@ -1396,6 +1412,78 @@ err_out:
return err;
}
+int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_map *st_ops_assoc;
+
+ guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+ st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+ lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+ if (st_ops_assoc && st_ops_assoc == map)
+ return 0;
+
+ if (st_ops_assoc) {
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ return -EBUSY;
+
+ rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON);
+ } else {
+ /*
+ * struct_ops map does not track associated non-struct_ops programs.
+ * Bump the refcount to make sure st_ops_assoc is always valid.
+ */
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ bpf_map_inc(map);
+
+ rcu_assign_pointer(prog->aux->st_ops_assoc, map);
+ }
+
+ return 0;
+}
+
+void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
+{
+ struct bpf_map *st_ops_assoc;
+
+ guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+ st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+ lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+ if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+ return;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ bpf_map_put(st_ops_assoc);
+
+ RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL);
+}
+
+/*
+ * Get a reference to the struct_ops struct (i.e., kdata) associated with a
+ * program. Should only be called in BPF program context (e.g., in a kfunc).
+ *
+ * If the returned pointer is not NULL, it points to a valid struct_ops.
+ * The struct_ops map is not guaranteed to be initialized or attached.
+ * Kernel struct_ops implementers are responsible for tracking and checking
+ * the state of the struct_ops if the use case requires an initialized or
+ * attached struct_ops.
+ */
+void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
+{
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *st_ops_assoc;
+
+ st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held());
+ if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+ return NULL;
+
+ st_map = (struct bpf_struct_ops_map *)st_ops_assoc;
+
+ return &st_map->kvalue.data;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops);
+
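
A hypothetical kernel-side consumer of the helper above (names and the struct are made up; this is a sketch, not part of the patch). The point is the contract spelled out in the comment: the helper hands back kdata, and the subsystem must track its own initialized/attached state before trusting it. The aux pointer is assumed to come from however the subsystem identifies the calling program, e.g. recorded at registration time.

#include <linux/bpf.h>
#include <linux/errno.h>

struct my_subsys_ops {
        int (*on_event)(int code);
        bool attached;          /* state the subsystem maintains itself */
};

static int my_subsys_dispatch(const struct bpf_prog_aux *aux, int code)
{
        struct my_subsys_ops *ops;

        ops = bpf_prog_get_assoc_struct_ops(aux);
        if (!ops)
                return -ENOENT;

        /* Not guaranteed to be initialized or attached: check our own state */
        if (!ops->attached)
                return -EAGAIN;

        return ops->on_event ? ops->on_event(code) : 0;
}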
void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index a1dc1bf0848a..605506792b5b 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -20,29 +20,6 @@
DEFINE_BPF_STORAGE_CACHE(task_cache);
-static DEFINE_PER_CPU(int, bpf_task_storage_busy);
-
-static void bpf_task_storage_lock(void)
-{
- cant_migrate();
- this_cpu_inc(bpf_task_storage_busy);
-}
-
-static void bpf_task_storage_unlock(void)
-{
- this_cpu_dec(bpf_task_storage_busy);
-}
-
-static bool bpf_task_storage_trylock(void)
-{
- cant_migrate();
- if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- this_cpu_dec(bpf_task_storage_busy);
- return false;
- }
- return true;
-}
-
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
struct task_struct *task = owner;
@@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- rcu_read_lock_dont_migrate();
+ rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage)
goto out;
- bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
- bpf_task_storage_unlock();
out:
- rcu_read_unlock_migrate();
+ rcu_read_unlock();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
goto out;
}
- bpf_task_storage_lock();
sdata = task_storage_lookup(task, map, true);
- bpf_task_storage_unlock();
put_pid(pid);
return sdata ? sdata->data : NULL;
out:
@@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
goto out;
}
- bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
true, GFP_ATOMIC);
- bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -155,8 +126,7 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
- bool nobusy)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
{
struct bpf_local_storage_data *sdata;
@@ -164,12 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
if (!sdata)
return -ENOENT;
- if (!nobusy)
- return -EBUSY;
-
- bpf_selem_unlink(SELEM(sdata), false);
-
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
@@ -194,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
goto out;
}
- bpf_task_storage_lock();
- err = task_storage_delete(task, map, true);
- bpf_task_storage_unlock();
+ err = task_storage_delete(task, map);
out:
put_pid(pid);
return err;
}
-/* Called by bpf_task_storage_get*() helpers */
-static void *__bpf_task_storage_get(struct bpf_map *map,
- struct task_struct *task, void *value,
- u64 flags, gfp_t gfp_flags, bool nobusy)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
- sdata = task_storage_lookup(task, map, nobusy);
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ sdata = task_storage_lookup(task, map, true);
if (sdata)
- return sdata->data;
+ return (unsigned long)sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, false, gfp_flags);
- return IS_ERR(sdata) ? NULL : sdata->data;
+ return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
- return NULL;
-}
-
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
-{
- bool nobusy;
- void *data;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
- return (unsigned long)NULL;
-
- nobusy = bpf_task_storage_trylock();
- data = __bpf_task_storage_get(map, task, value, flags,
- gfp_flags, nobusy);
- if (nobusy)
- bpf_task_storage_unlock();
- return (unsigned long)data;
-}
-
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
-{
- void *data;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
- return (unsigned long)NULL;
-
- bpf_task_storage_lock();
- data = __bpf_task_storage_get(map, task, value, flags,
- gfp_flags, true);
- bpf_task_storage_unlock();
- return (unsigned long)data;
-}
-
-BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
- task)
-{
- bool nobusy;
- int ret;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (!task)
- return -EINVAL;
-
- nobusy = bpf_task_storage_trylock();
- /* This helper must only be called from places where the lifetime of the task
- * is guaranteed. Either by being refcounted or by being protected
- * by an RCU read-side critical section.
- */
- ret = task_storage_delete(task, map, nobusy);
- if (nobusy)
- bpf_task_storage_unlock();
- return ret;
+ return (unsigned long)NULL;
}
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
task)
{
- int ret;
-
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map, true);
- bpf_task_storage_unlock();
- return ret;
+ return task_storage_delete(task, map);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -313,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
static void task_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache);
}
BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
@@ -332,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_owner_storage_ptr = task_storage_ptr,
};
-const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
- .func = bpf_task_storage_get_recur,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
- .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
- .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
- .arg4_type = ARG_ANYTHING,
-};
-
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
@@ -354,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.arg4_type = ARG_ANYTHING,
};
-const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
- .func = bpf_task_storage_delete_recur,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
- .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
-};
-
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0de8fc8a0e0b..7708958e3fb8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,7 @@
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
+#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/overflow.h>
@@ -259,6 +260,7 @@ struct btf {
void *nohdr_data;
struct btf_header hdr;
u32 nr_types; /* includes VOID for base BTF */
+ u32 named_start_id;
u32 types_size;
u32 data_size;
refcount_t refcnt;
@@ -494,6 +496,11 @@ static bool btf_type_is_modifier(const struct btf_type *t)
return false;
}
+static int btf_start_id(const struct btf *btf)
+{
+ return btf->start_id + (btf->base_btf ? 0 : 1);
+}
+
bool btf_type_is_void(const struct btf_type *t)
{
return t == &btf_void;
@@ -544,21 +551,125 @@ u32 btf_nr_types(const struct btf *btf)
return total;
}
+/*
+ * Note that vmlinux and kernel module BTFs are always sorted
+ * during the building phase.
+ */
+static void btf_check_sorted(struct btf *btf)
+{
+ u32 i, n, named_start_id = 0;
+
+ n = btf_nr_types(btf);
+ if (btf_is_vmlinux(btf)) {
+ for (i = btf_start_id(btf); i < n; i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ const char *n = btf_name_by_offset(btf, t->name_off);
+
+ if (n[0] != '\0') {
+ btf->named_start_id = i;
+ return;
+ }
+ }
+ return;
+ }
+
+ for (i = btf_start_id(btf) + 1; i < n; i++) {
+ const struct btf_type *ta = btf_type_by_id(btf, i - 1);
+ const struct btf_type *tb = btf_type_by_id(btf, i);
+ const char *na = btf_name_by_offset(btf, ta->name_off);
+ const char *nb = btf_name_by_offset(btf, tb->name_off);
+
+ if (strcmp(na, nb) > 0)
+ return;
+
+ if (named_start_id == 0 && na[0] != '\0')
+ named_start_id = i - 1;
+ if (named_start_id == 0 && nb[0] != '\0')
+ named_start_id = i;
+ }
+
+ if (named_start_id)
+ btf->named_start_id = named_start_id;
+}
+
+/*
+ * btf_named_start_id - Get the named starting ID for the BTF
+ * @btf: Pointer to the target BTF object
+ * @own: Flag indicating whether to query only the current BTF (true = current BTF only,
+ * false = recursively traverse the base BTF chain)
+ *
+ * Return value rules:
+ * 1. For a sorted btf, return its named_start_id
+ * 2. Else for a split BTF, return its start_id
+ * 3. Else for a base BTF, return 1
+ */
+u32 btf_named_start_id(const struct btf *btf, bool own)
+{
+ const struct btf *base_btf = btf;
+
+ while (!own && base_btf->base_btf)
+ base_btf = base_btf->base_btf;
+
+ return base_btf->named_start_id ?: (base_btf->start_id ?: 1);
+}
+
+static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name)
+{
+ const struct btf_type *t;
+ const char *tname;
+ s32 l, r, m;
+
+ l = btf_named_start_id(btf, true);
+ r = btf_nr_types(btf) - 1;
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ t = btf_type_by_id(btf, m);
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) >= 0) {
+ if (l == r)
+ return r;
+ r = m;
+ } else {
+ l = m + 1;
+ }
+ }
+
+ return btf_nr_types(btf);
+}
+
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
+ const struct btf *base_btf = btf_base_btf(btf);
const struct btf_type *t;
const char *tname;
- u32 i, total;
+ s32 id, total;
- total = btf_nr_types(btf);
- for (i = 1; i < total; i++) {
- t = btf_type_by_id(btf, i);
- if (BTF_INFO_KIND(t->info) != kind)
- continue;
+ if (base_btf) {
+ id = btf_find_by_name_kind(base_btf, name, kind);
+ if (id > 0)
+ return id;
+ }
- tname = btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, name))
- return i;
+ total = btf_nr_types(btf);
+ if (btf->named_start_id > 0 && name[0]) {
+ id = btf_find_by_name_kind_bsearch(btf, name);
+ for (; id < total; id++) {
+ t = btf_type_by_id(btf, id);
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) != 0)
+ return -ENOENT;
+ if (BTF_INFO_KIND(t->info) == kind)
+ return id;
+ }
+ } else {
+ for (id = btf_start_id(btf); id < total; id++) {
+ t = btf_type_by_id(btf, id);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) == 0)
+ return id;
+ }
}
return -ENOENT;
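
The bsearch helper above is a lower-bound search over the name-sorted tail of the type array: it returns the first ID whose name compares greater than or equal to the target, and the caller then scans forward while the names still match to filter by kind. A standalone sketch of the same shape over a plain string array:

#include <stdio.h>
#include <string.h>

/* Return the index of the first entry >= name, or n if there is none. */
static int lower_bound(const char * const *names, int n, const char *name)
{
        int l = 0, r = n - 1;

        while (l <= r) {
                int m = l + (r - l) / 2;

                if (strcmp(names[m], name) >= 0) {
                        if (l == r)
                                return r;
                        r = m;
                } else {
                        l = m + 1;
                }
        }
        return n;
}

int main(void)
{
        const char *names[] = { "alpha", "beta", "beta", "delta", "gamma" };
        int n = sizeof(names) / sizeof(names[0]);
        int i = lower_bound(names, n, "beta");

        /* scan duplicates forward, the way btf_find_by_name_kind() filters by kind */
        for (; i < n && !strcmp(names[i], "beta"); i++)
                printf("match at %d\n", i);
        return 0;
}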
@@ -3424,7 +3535,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
const struct btf_type *t;
int len, id;
- id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key,
+ btf_named_start_id(btf, false) - 1);
if (id < 0)
return ERR_PTR(id);
@@ -5791,6 +5903,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
goto errout;
}
env->btf = btf;
+ btf->named_start_id = 0;
data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
@@ -6107,6 +6220,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FSESSION:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
@@ -6210,7 +6324,8 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
btf->data = data;
btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "%s", name);
+ btf->named_start_id = 0;
+ strscpy(btf->name, name);
err = btf_parse_hdr(env);
if (err)
@@ -6230,6 +6345,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
if (err)
goto errout;
+ btf_check_sorted(btf);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -6327,7 +6443,8 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
btf->start_id = base_btf->nr_types;
btf->start_str_off = base_btf->hdr.str_len;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "%s", module_name);
+ btf->named_start_id = 0;
+ strscpy(btf->name, module_name);
btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
@@ -6363,6 +6480,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
}
btf_verifier_env_free(env);
+ btf_check_sorted(btf);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -6704,6 +6822,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
fallthrough;
case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
* int LSM hooks, they use MODIFY_RETURN trampolines.
@@ -7729,12 +7848,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname);
return -EINVAL;
}
+
/* Convert BTF function arguments into verifier types.
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
u32 tags = 0;
- int id = 0;
+ int id = btf_named_start_id(btf, false) - 1;
/* 'arg:<tag>' decl_tag takes precedence over derivation of
* register type from BTF type itself
@@ -8640,24 +8760,17 @@ end:
return ret;
}
-static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
- enum btf_kfunc_hook hook,
- u32 kfunc_btf_id,
- const struct bpf_prog *prog)
+static u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id)
{
- struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *set;
- u32 *id, i;
+ u32 *id;
if (hook >= BTF_KFUNC_HOOK_MAX)
return NULL;
if (!btf->kfunc_set_tab)
return NULL;
- hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
- for (i = 0; i < hook_filter->nr_filters; i++) {
- if (hook_filter->filters[i](prog, kfunc_btf_id))
- return NULL;
- }
set = btf->kfunc_set_tab->sets[hook];
if (!set)
return NULL;
@@ -8668,6 +8781,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
return id + 1;
}
+static bool __btf_kfunc_is_allowed(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ int i;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return false;
+ if (!btf->kfunc_set_tab)
+ return false;
+
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return false;
+ }
+
+ return true;
+}
+
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
switch (prog_type) {
@@ -8681,6 +8816,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
@@ -8714,6 +8850,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
}
}
+bool btf_kfunc_is_allowed(const struct btf *btf,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog))
+ return true;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
+ if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog))
+ return true;
+
+ return false;
+}
+
/* Caution:
* Reference to the module (obtained using btf_try_get_module) corresponding to
* the struct btf *MUST* be held when calling this function from verifier
@@ -8721,26 +8877,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* keeping the reference for the duration of the call provides the necessary
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
-u32 *btf_kfunc_id_set_contains(const struct btf *btf,
- u32 kfunc_btf_id,
- const struct bpf_prog *prog)
+u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
enum btf_kfunc_hook hook;
u32 *kfunc_flags;
- kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
+ kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
if (kfunc_flags)
return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
+ return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
const struct bpf_prog *prog)
{
- return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
+ if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog))
+ return NULL;
+
+ return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
}
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
@@ -8845,6 +9002,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
*/
if (!t || !btf_type_is_ptr(t))
return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_void(t))
+ return -EINVAL;
+ }
}
return 0;
}
@@ -9215,7 +9379,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
}
/* Attempt to find target candidates in vmlinux BTF first */
- cands = bpf_core_add_cands(cands, main_btf, 1);
+ cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true));
if (IS_ERR(cands))
return ERR_CAST(cands);
@@ -9247,7 +9411,7 @@ check_modules:
*/
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
- cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true));
btf_put(mod_btf);
if (IS_ERR(cands))
return ERR_CAST(cands);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 69988af44b37..b029f0369ecf 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct cgroup *cgrp;
int ret;
- /* Check socket family since not all sockets represent network
- * endpoint (e.g. AF_UNIX).
- */
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
- sk->sk_family != AF_UNIX)
+ if (!sk_is_inet(sk) && !sk_is_unix(sk))
return 0;
if (!ctx.uaddr) {
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index f04a468cf6a7..fd51fe3d92cc 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -8,12 +8,13 @@
#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
-/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+/* cgroup_iter provides five modes of traversal to the cgroup hierarchy.
*
* 1. Walk the descendants of a cgroup in pre-order.
* 2. Walk the descendants of a cgroup in post-order.
* 3. Walk the ancestors of a cgroup.
* 4. Show the given cgroup only.
+ * 5. Walk the children of a given parent cgroup.
*
* For walking descendants, cgroup_iter can walk in either pre-order or
* post-order. For walking ancestors, the iter walks up from a cgroup to
@@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
return css_next_descendant_pre(NULL, p->start_css);
else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
return css_next_descendant_post(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+ return css_next_child(NULL, p->start_css);
else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
return p->start_css;
}
@@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return css_next_descendant_post(curr, p->start_css);
else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
return curr->parent;
+ else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+ return css_next_child(curr, p->start_css);
else /* BPF_CGROUP_ITER_SELF_ONLY */
return NULL;
}
@@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
int order = linfo->cgroup.order;
struct cgroup *cgrp;
- if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
- order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
- order != BPF_CGROUP_ITER_ANCESTORS_UP &&
- order != BPF_CGROUP_ITER_SELF_ONLY)
+ switch (order) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ case BPF_CGROUP_ITER_SELF_ONLY:
+ case BPF_CGROUP_ITER_CHILDREN:
+ break;
+ default:
return -EINVAL;
+ }
if (fd && id)
return -EINVAL;
@@ -257,6 +267,8 @@ show_order:
seq_puts(seq, "order: descendants_post\n");
else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
seq_puts(seq, "order: ancestors_up\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN)
+ seq_puts(seq, "order: children\n");
else /* BPF_CGROUP_ITER_SELF_ONLY */
seq_puts(seq, "order: self_only\n");
}
@@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
case BPF_CGROUP_ITER_DESCENDANTS_PRE:
case BPF_CGROUP_ITER_DESCENDANTS_POST:
case BPF_CGROUP_ITER_ANCESTORS_UP:
+ case BPF_CGROUP_ITER_CHILDREN:
break;
default:
return -EINVAL;
@@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i
case BPF_CGROUP_ITER_DESCENDANTS_POST:
kit->pos = css_next_descendant_post(kit->pos, kit->start);
break;
+ case BPF_CGROUP_ITER_CHILDREN:
+ kit->pos = css_next_child(kit->pos, kit->start);
+ break;
case BPF_CGROUP_ITER_ANCESTORS_UP:
kit->pos = kit->pos ? kit->pos->parent : kit->start;
}
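
A sketch of wiring up the new children order from user space. It assumes an already loaded SEC("iter/cgroup") program fd, a libbpf/uapi that exposes BPF_CGROUP_ITER_CHILDREN, and follows the same flow the existing orders use.

#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Attach a cgroup iterator that walks only the direct children of
 * cgroup_path; returns the link fd or a negative value on error.
 */
int attach_children_iter(int prog_fd, const char *cgroup_path)
{
        union bpf_iter_link_info linfo = {};
        LIBBPF_OPTS(bpf_link_create_opts, opts);
        int cgrp_fd, link_fd;

        cgrp_fd = open(cgroup_path, O_RDONLY);
        if (cgrp_fd < 0)
                return -1;

        linfo.cgroup.cgroup_fd = cgrp_fd;
        linfo.cgroup.order = BPF_CGROUP_ITER_CHILDREN;  /* new order added above */
        opts.iter_info = &linfo;
        opts.iter_info_len = sizeof(linfo);

        link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, &opts);
        close(cgrp_fd);
        return link_fd; /* bpf_iter_create(link_fd) + read() then walks the children */
}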
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1b9b18e5b03c..5ab6bace7d0d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -25,6 +25,7 @@
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/hex.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
@@ -112,7 +113,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4,
+ bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -136,6 +138,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
+ mutex_init(&fp->aux->st_ops_assoc_mutex);
#ifdef CONFIG_BPF_SYSCALL
bpf_prog_stream_init(fp);
@@ -286,6 +289,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
if (fp->aux) {
mutex_destroy(&fp->aux->used_maps_mutex);
mutex_destroy(&fp->aux->dst_mutex);
+ mutex_destroy(&fp->aux->st_ops_assoc_mutex);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
@@ -713,8 +717,8 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
-int __bpf_address_lookup(unsigned long addr, unsigned long *size,
- unsigned long *off, char *sym)
+int bpf_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
{
struct bpf_ksym *ksym;
int ret = 0;
@@ -2398,6 +2402,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
map->owner->type = prog_type;
map->owner->jited = fp->jited;
map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->sleepable = fp->sleepable;
map->owner->expected_attach_type = fp->expected_attach_type;
map->owner->attach_func_proto = aux->attach_func_proto;
for_each_cgroup_storage_type(i) {
@@ -2409,7 +2414,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
} else {
ret = map->owner->type == prog_type &&
map->owner->jited == fp->jited &&
- map->owner->xdp_has_frags == aux->xdp_has_frags;
+ map->owner->xdp_has_frags == aux->xdp_has_frags &&
+ map->owner->sleepable == fp->sleepable;
if (ret &&
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
map->owner->expected_attach_type != fp->expected_attach_type)
@@ -2912,6 +2918,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
+ bpf_prog_disassoc_struct_ops(aux->prog);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
@@ -3138,6 +3145,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
return false;
}
+bool __weak bpf_jit_supports_fsession(void)
+{
+ return false;
+}
+
u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 703e5df1f4ef..04171fbc39cb 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
u32 cpu)
{
- int numa, err, i, fd = value->bpf_prog.fd;
+ int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
struct xdp_bulk_queue *bq;
@@ -440,7 +440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
if (!rcpu)
- return NULL;
+ return ERR_PTR(err);
/* Alloc percpu bulkq */
rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
@@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->value.qsize = value->qsize;
gro_init(&rcpu->gro);
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
- goto free_ptr_ring;
+ if (fd > 0) {
+ err = __cpu_map_load_bpf_program(rcpu, map, fd);
+ if (err)
+ goto free_ptr_ring;
+ }
/* Setup kthread */
init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
- if (IS_ERR(rcpu->kthread))
+ if (IS_ERR(rcpu->kthread)) {
+ err = PTR_ERR(rcpu->kthread);
goto free_prog;
+ }
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
@@ -503,7 +508,7 @@ free_bulkq:
free_percpu(rcpu->bulkq);
free_rcu:
kfree(rcpu);
- return NULL;
+ return ERR_PTR(err);
}
static void __cpu_map_entry_free(struct work_struct *work)
@@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
} else {
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
- if (!rcpu)
- return -ENOMEM;
+ if (IS_ERR(rcpu))
+ return PTR_ERR(rcpu);
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 9876c5fe6c2a..b8c805b4b06a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -477,7 +477,7 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE)
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 83c4d9943084..7e75a1936256 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -60,7 +60,7 @@ struct bpf_crypto_ctx {
int bpf_crypto_register_type(const struct bpf_crypto_type *type)
{
struct bpf_crypto_type_list *node;
- int err = -EEXIST;
+ int err = -EBUSY;
down_write(&bpf_crypto_types_sem);
list_for_each_entry(node, &bpf_crypto_types, list) {
@@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
call_rcu(&ctx->rcu, crypto_free_cb);
}
+__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx)
+{
+ bpf_crypto_ctx_release(ctx);
+}
+CFI_NOSEAL(bpf_crypto_ctx_release_dtor);
+
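
The wrapper above exists because, with CONFIG_CFI_CLANG, an indirect call through btf_dtor_kfunc_t, i.e. void (*)(void *), must land on a function whose actual prototype is void(void *); calling the typed release function through that pointer would trip the CFI type check. A standalone sketch of the shape with a made-up object type:

#include <stdlib.h>

struct my_ctx { int refs; };

typedef void (*dtor_fn_t)(void *);      /* plays the role of btf_dtor_kfunc_t */

static void my_ctx_release(struct my_ctx *ctx)
{
        if (--ctx->refs == 0)
                free(ctx);
}

/* Thin wrapper with the exact signature the indirect caller expects;
 * under kernel CFI this is what keeps the indirect call type-correct.
 */
static void my_ctx_release_dtor(void *ctx)
{
        my_ctx_release(ctx);
}

int main(void)
{
        dtor_fn_t dtor = my_ctx_release_dtor;
        struct my_ctx *ctx = calloc(1, sizeof(*ctx));

        ctx->refs = 1;
        dtor(ctx);              /* indirect call through the void(void *) type */
        return 0;
}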
static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
const struct bpf_dynptr_kern *src,
const struct bpf_dynptr_kern *dst,
@@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = {
BTF_ID_LIST(bpf_crypto_dtor_ids)
BTF_ID(struct, bpf_crypto_ctx)
-BTF_ID(func, bpf_crypto_ctx_release)
+BTF_ID(func, bpf_crypto_ctx_release_dtor)
static int __init crypto_kfunc_init(void)
{
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c8a9b27f8663..3b9d297a53be 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -82,9 +82,6 @@ struct bucket {
rqspinlock_t raw_lock;
};
-#define HASHTAB_MAP_LOCK_COUNT 8
-#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
-
struct bpf_htab {
struct bpf_map map;
struct bpf_mem_alloc ma;
@@ -932,7 +929,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
- void *value, bool onallcpus)
+ void *value, bool onallcpus, u64 map_flags)
{
void *ptr;
@@ -943,19 +940,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
- int off = 0, cpu;
+ void *val;
+ int cpu;
+
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
+ return;
+ }
for_each_possible_cpu(cpu) {
ptr = per_cpu_ptr(pptr, cpu);
- copy_map_value_long(&htab->map, ptr, value + off);
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(&htab->map, ptr, val);
bpf_obj_free_fields(htab->map.record, ptr);
- off += size;
}
}
}
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
- void *value, bool onallcpus)
+ void *value, bool onallcpus, u64 map_flags)
{
/* When not setting the initial value on all cpus, zero-fill element
* values for other cpus. Otherwise, bpf program has no way to ensure
@@ -973,7 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_copy_value(htab, pptr, value, onallcpus, map_flags);
}
}
@@ -985,7 +991,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
- struct htab_elem *old_elem)
+ struct htab_elem *old_elem, u64 map_flags)
{
u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
@@ -1043,7 +1049,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = *(void __percpu **)ptr;
}
- pcpu_init_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
@@ -1147,7 +1153,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
}
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
- l_old);
+ l_old, map_flags);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -1249,6 +1255,15 @@ err_lock_bucket:
return ret;
}
+static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
+{
+ if (unlikely(!onallcpus && map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)))
+ return -EINVAL;
+ return 0;
+}
+
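
From user space, the new flags are encoded in the 64-bit flags word exactly as the code above decodes them: BPF_F_CPU with the target CPU in bits 63..32 touches a single CPU's slot, while BPF_F_ALL_CPUS replicates one value to every CPU. A hedged sketch, assuming a per-cpu hash map fd and a libbpf/uapi that exposes the new flags:

#include <bpf/bpf.h>
#include <linux/bpf.h>

int update_one_cpu(int map_fd, __u32 key, __u64 val, int cpu)
{
        /* the target CPU travels in the upper 32 bits of the flags */
        __u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

        return bpf_map_update_elem(map_fd, &key, &val, flags);
}

int update_all_cpus(int map_fd, __u32 key, __u64 val)
{
        /* one value, copied into every CPU's slot by the kernel */
        return bpf_map_update_elem(map_fd, &key, &val, BPF_F_ALL_CPUS);
}

int read_one_cpu(int map_fd, __u32 key, __u64 *val, int cpu)
{
        /* with BPF_F_CPU the value buffer holds a single value, not one per CPU */
        return bpf_map_lookup_elem_flags(map_fd, &key, val,
                                         BPF_F_CPU | ((__u64)cpu << 32));
}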
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool percpu, bool onallcpus)
@@ -1262,9 +1277,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
- /* unknown flags */
- return -EINVAL;
+ ret = htab_map_check_update_flags(onallcpus, map_flags);
+ if (unlikely(ret))
+ return ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
@@ -1289,7 +1304,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
/* Update value in-place */
if (percpu) {
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
} else {
void **inner_map_pptr = htab_elem_value(l_old, key_size);
@@ -1298,7 +1313,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
}
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, percpu, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL, map_flags);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -1324,9 +1339,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
- /* unknown flags */
- return -EINVAL;
+ ret = htab_map_check_update_flags(onallcpus, map_flags);
+ if (unlikely(ret))
+ return ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
@@ -1363,10 +1378,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* per-cpu hash map can update value in-place */
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
} else {
pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
@@ -1678,9 +1693,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size, map_id;
+ u64 elem_map_flags, map_flags, allowed_flags;
u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
- u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
unsigned long flags = 0;
@@ -1690,9 +1705,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
int ret = 0;
elem_map_flags = attr->batch.elem_flags;
- if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
- return -EINVAL;
+ allowed_flags = BPF_F_LOCK;
+ if (!do_delete && is_percpu)
+ allowed_flags |= BPF_F_CPU;
+ ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags);
+ if (ret)
+ return ret;
map_flags = attr->batch.flags;
if (map_flags)
@@ -1715,7 +1733,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
key_size = htab->map.key_size;
value_size = htab->map.value_size;
size = round_up(value_size, 8);
- if (is_percpu)
+ if (is_percpu && !(elem_map_flags & BPF_F_CPU))
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
@@ -1798,10 +1816,17 @@ again_nocopy:
void __percpu *pptr;
pptr = htab_elem_get_ptr(l, map->key_size);
- for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
- check_and_init_map_value(&htab->map, dst_val + off);
- off += size;
+ if (elem_map_flags & BPF_F_CPU) {
+ cpu = elem_map_flags >> 32;
+ copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val);
+ } else {
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, dst_val + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
+ off += size;
+ }
}
} else {
value = htab_elem_value(l, key_size);
@@ -2209,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map)
bool prealloc = htab_is_prealloc(htab);
bool percpu = htab_is_percpu(htab);
bool lru = htab_is_lru(htab);
- u64 num_entries;
- u64 usage = sizeof(struct bpf_htab);
+ u64 num_entries, usage;
+
+ usage = sizeof(struct bpf_htab) +
+ sizeof(struct bucket) * htab->n_buckets;
- usage += sizeof(struct bucket) * htab->n_buckets;
- usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
if (prealloc) {
num_entries = map->max_entries;
if (htab_has_extra_elems(htab))
@@ -2357,7 +2382,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k
return NULL;
}
-int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
struct htab_elem *l;
void __percpu *pptr;
@@ -2374,16 +2399,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
+ ret = 0;
/* We do not mark LRU map element here in order to not mess up
* eviction heuristics when user space does a map walk.
*/
pptr = htab_elem_get_ptr(l, map->key_size);
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value);
+ goto out;
+ }
for_each_possible_cpu(cpu) {
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
- ret = 0;
out:
rcu_read_unlock();
return ret;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index db72b96f9c8c..7ac32798eb04 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1077,7 +1077,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.func = bpf_snprintf,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
@@ -1095,16 +1095,34 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
return (void *)value - round_up(map->key_size, 8);
}
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
+enum bpf_async_op {
+ BPF_ASYNC_START,
+ BPF_ASYNC_CANCEL
+};
+
+struct bpf_async_cmd {
+ struct llist_node node;
+ u64 nsec;
+ u32 mode;
+ enum bpf_async_op op;
+};
+
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
- union {
- struct rcu_head rcu;
- struct work_struct delete_work;
- };
+ struct rcu_head rcu;
u64 flags;
+ struct irq_work worker;
+ refcount_t refcnt;
+ enum bpf_async_type type;
+ struct llist_head async_cmds;
};
/* BPF map elements can contain 'struct bpf_timer'.
@@ -1132,7 +1150,6 @@ struct bpf_hrtimer {
struct bpf_work {
struct bpf_async_cb cb;
struct work_struct work;
- struct work_struct delete_work;
};
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
@@ -1142,20 +1159,12 @@ struct bpf_async_kern {
struct bpf_hrtimer *timer;
struct bpf_work *work;
};
- /* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space reserved by struct bpf_timer
- * regardless of LOCKDEP and spinlock debug flags.
- */
- struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
-enum bpf_async_type {
- BPF_ASYNC_TYPE_TIMER = 0,
- BPF_ASYNC_TYPE_WQ,
-};
-
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+static void bpf_async_refcount_put(struct bpf_async_cb *cb);
+
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1219,45 +1228,85 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
{
struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+ /*
+ * Drop the last reference to prog only after RCU GP, as set_callback()
+ * may race with cancel_and_free()
+ */
+ if (cb->prog)
+ bpf_prog_put(cb->prog);
+
kfree_nolock(cb);
}
-static void bpf_wq_delete_work(struct work_struct *work)
+/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */
+static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
{
- struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
+ struct bpf_work *w = container_of(cb, struct bpf_work, cb);
+ bool retry = false;
- cancel_work_sync(&w->work);
+ /*
+ * bpf_async_cancel_and_free() tried to cancel timer/wq, but it
+ * could have raced with timer/wq_start. Now refcnt is zero and
+ * srcu/rcu GP completed. Cancel timer/wq again.
+ */
+ switch (cb->type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ if (hrtimer_try_to_cancel(&t->timer) < 0)
+ retry = true;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ if (!cancel_work(&w->work) && work_busy(&w->work))
+ retry = true;
+ break;
+ }
+ if (retry) {
+ /*
+ * hrtimer or wq callback may still be running. It must be
+ * in rcu_tasks_trace or rcu CS, so wait for GP again.
+ * It won't retry forever, since refcnt zero prevents all
+ * operations on timer/wq.
+ */
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
+ return;
+ }
- call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
+ /* rcu_trace_implies_rcu_gp() is true and will remain so */
+ bpf_async_cb_rcu_free(rcu);
}
-static void bpf_timer_delete_work(struct work_struct *work)
+static void worker_for_call_rcu(struct irq_work *work)
{
- struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+ struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call
- * call_rcu() right after for both preallocated and non-preallocated
- * maps. The async->cb = NULL was already done and no code path can see
- * address 't' anymore. Timer if armed for existing bpf_hrtimer before
- * bpf_timer_cancel_and_free will have been cancelled.
- */
- hrtimer_cancel(&t->timer);
- call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
}
+static void bpf_async_refcount_put(struct bpf_async_cb *cb)
+{
+ if (!refcount_dec_and_test(&cb->refcnt))
+ return;
+
+ if (irqs_disabled()) {
+ cb->worker = IRQ_WORK_INIT(worker_for_call_rcu);
+ irq_work_queue(&cb->worker);
+ } else {
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
+ }
+}
+
+static void bpf_async_cancel_and_free(struct bpf_async_kern *async);
+static void bpf_async_irq_worker(struct irq_work *work);
+
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
- struct bpf_async_cb *cb;
+ struct bpf_async_cb *cb, *old_cb;
struct bpf_hrtimer *t;
struct bpf_work *w;
clockid_t clockid;
size_t size;
- int ret = 0;
-
- if (in_nmi())
- return -EOPNOTSUPP;
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
@@ -1270,18 +1319,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
return -EINVAL;
}
- __bpf_spin_lock_irqsave(&async->lock);
- t = async->timer;
- if (t) {
- ret = -EBUSY;
- goto out;
- }
+ old_cb = READ_ONCE(async->cb);
+ if (old_cb)
+ return -EBUSY;
cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
- if (!cb) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!cb)
+ return -ENOMEM;
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
@@ -1289,7 +1333,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
t = (struct bpf_hrtimer *)cb;
atomic_set(&t->cancelling, 0);
- INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
cb->value = (void *)async - map->record->timer_off;
break;
@@ -1297,16 +1340,24 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
w = (struct bpf_work *)cb;
INIT_WORK(&w->work, bpf_wq_work);
- INIT_WORK(&w->delete_work, bpf_wq_delete_work);
cb->value = (void *)async - map->record->wq_off;
break;
}
cb->map = map;
cb->prog = NULL;
cb->flags = flags;
+ cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
+ init_llist_head(&cb->async_cmds);
+ refcount_set(&cb->refcnt, 1); /* map's reference */
+ cb->type = type;
rcu_assign_pointer(cb->callback_fn, NULL);
- WRITE_ONCE(async->cb, cb);
+ old_cb = cmpxchg(&async->cb, NULL, cb);
+ if (old_cb) {
+ /* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+ kfree_nolock(cb);
+ return -EBUSY;
+ }
/* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
@@ -1317,13 +1368,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
- WRITE_ONCE(async->cb, NULL);
- kfree_nolock(cb);
- ret = -EPERM;
+ bpf_async_cancel_and_free(async);
+ return -EPERM;
}
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return ret;
+
+ return 0;
}
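The init path above replaces the per-element spinlock with an allocate-then-publish scheme: the new object is installed with cmpxchg() and simply freed if another initializer won the race. A minimal userspace sketch of that pattern with C11 atomics, assuming illustrative names:

#include <stdatomic.h>
#include <stdlib.h>

struct obj { int payload; };

static _Atomic(struct obj *) slot;	/* plays the role of async->cb */

static int publish_once(void)
{
	struct obj *expected = NULL;
	struct obj *new = calloc(1, sizeof(*new));

	if (!new)
		return -1;			/* -ENOMEM in the kernel code */
	if (!atomic_compare_exchange_strong(&slot, &expected, new)) {
		free(new);			/* lost the race: drop our copy */
		return -2;			/* -EBUSY in the kernel code */
	}
	return 0;				/* we published the object */
}

int main(void)
{
	return (publish_once() == 0 && publish_once() == -2) ? 0 : 1;
}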
BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
@@ -1354,56 +1403,90 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
- struct bpf_prog_aux *aux, unsigned int flags,
- enum bpf_async_type type)
+static int bpf_async_update_prog_callback(struct bpf_async_cb *cb,
+ struct bpf_prog *prog,
+ void *callback_fn)
{
- struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_async_cb *cb;
- int ret = 0;
+ struct bpf_prog *prev;
- if (in_nmi())
- return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&async->lock);
- cb = async->cb;
- if (!cb) {
- ret = -EINVAL;
- goto out;
- }
- if (!atomic64_read(&cb->map->usercnt)) {
- /* maps with timers must be either held by user space
- * or pinned in bpffs. Otherwise timer might still be
- * running even when bpf prog is detached and user space
- * is gone, since map_release_uref won't ever be called.
- */
- ret = -EPERM;
- goto out;
+ /* Acquire a guard reference on prog to prevent it from being freed during the loop */
+ if (prog) {
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
}
- prev = cb->prog;
- if (prev != prog) {
- /* Bump prog refcnt once. Every bpf_timer_set_callback()
- * can pick different callback_fn-s within the same prog.
+
+ do {
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ prev = xchg(&cb->prog, prog);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
+
+ /*
+ * Release the previous prog. If another CPU is contending to set
+ * bpf_prog, references are not leaked, since each iteration acquires
+ * and releases one reference.
*/
- prog = bpf_prog_inc_not_zero(prog);
- if (IS_ERR(prog)) {
- ret = PTR_ERR(prog);
- goto out;
- }
if (prev)
- /* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- cb->prog = prog;
+
+ } while (READ_ONCE(cb->prog) != prog ||
+ (void __force *)READ_ONCE(cb->callback_fn) != callback_fn);
+
+ if (prog)
+ bpf_prog_put(prog);
+
+ return 0;
+}
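bpf_async_update_prog_callback() keeps re-publishing the (prog, callback_fn) pair until it observes its own values, so the final state is always a matched pair even when several CPUs race. A stripped-down userspace sketch of that retry idea, with the prog refcounting deliberately omitted and all names illustrative:

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) cur_prog;	/* plays the role of cb->prog */
static _Atomic(void *) cur_cbfn;	/* plays the role of cb->callback_fn */

static void settle_pair(void *prog, void *cbfn)
{
	do {
		atomic_exchange(&cur_prog, prog);
		atomic_store(&cur_cbfn, cbfn);
		/*
		 * If another writer raced with us, one of the reads below
		 * differs from what we wrote and we publish the pair again.
		 * Whoever finishes last leaves a consistent pair behind.
		 */
	} while (atomic_load(&cur_prog) != prog ||
		 atomic_load(&cur_cbfn) != cbfn);
}

int main(void)
{
	int dummy_prog, dummy_cb;

	settle_pair(&dummy_prog, &dummy_cb);
	return atomic_load(&cur_prog) != &dummy_prog;
}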
+
+static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running);
+
+static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op,
+ u64 nsec, u32 timer_mode)
+{
+ /*
+ * Do not schedule another operation on this CPU if it is in the
+ * irq_work callback that is processing the async_cmds queue.
+ * Otherwise the following loop is possible:
+ * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue().
+ * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start().
+ */
+ if (this_cpu_read(async_cb_running) == cb) {
+ bpf_async_refcount_put(cb);
+ return -EDEADLK;
}
- rcu_assign_pointer(cb->callback_fn, callback_fn);
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return ret;
+
+ struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE);
+
+ if (!cmd) {
+ bpf_async_refcount_put(cb);
+ return -ENOMEM;
+ }
+ init_llist_node(&cmd->node);
+ cmd->nsec = nsec;
+ cmd->mode = timer_mode;
+ cmd->op = op;
+ if (llist_add(&cmd->node, &cb->async_cmds))
+ irq_work_queue(&cb->worker);
+ return 0;
+}
+
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog *prog)
+{
+ struct bpf_async_cb *cb;
+
+ cb = READ_ONCE(async->cb);
+ if (!cb)
+ return -EINVAL;
+
+ return bpf_async_update_prog_callback(cb, prog, callback_fn);
}
BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
struct bpf_prog_aux *, aux)
{
- return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+ return __bpf_async_set_callback(timer, callback_fn, aux->prog);
}
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
@@ -1414,22 +1497,22 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+static bool defer_timer_wq_op(void)
+{
+ return in_hardirq() || irqs_disabled();
+}
+
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
- int ret = 0;
- enum hrtimer_mode mode;
+ u32 mode;
- if (in_nmi())
- return -EOPNOTSUPP;
if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t || !t->cb.prog) {
- ret = -EINVAL;
- goto out;
- }
+
+ t = READ_ONCE(async->timer);
+ if (!t || !READ_ONCE(t->cb.prog))
+ return -EINVAL;
if (flags & BPF_F_TIMER_ABS)
mode = HRTIMER_MODE_ABS_SOFT;
@@ -1439,10 +1522,20 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
if (flags & BPF_F_TIMER_CPU_PIN)
mode |= HRTIMER_MODE_PINNED;
- hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
- return ret;
+ /*
+ * bpf_async_cancel_and_free() could have dropped refcnt to zero. In
+ * that case BPF progs are not allowed to arm the timer, to prevent UAF.
+ */
+ if (!refcount_inc_not_zero(&t->cb.refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+ bpf_async_refcount_put(&t->cb);
+ return 0;
+ } else {
+ return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
+ }
}
static const struct bpf_func_proto bpf_timer_start_proto = {
@@ -1454,32 +1547,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_async_cb *async)
-{
- struct bpf_prog *prog = async->prog;
-
- if (prog) {
- bpf_prog_put(prog);
- async->prog = NULL;
- rcu_assign_pointer(async->callback_fn, NULL);
- }
-}
-
-BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
{
struct bpf_hrtimer *t, *cur_t;
bool inc = false;
int ret = 0;
- if (in_nmi())
+ if (defer_timer_wq_op())
return -EOPNOTSUPP;
- rcu_read_lock();
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
+
+ t = READ_ONCE(async->timer);
+ if (!t)
+ return -EINVAL;
cur_t = this_cpu_read(hrtimer_running);
if (cur_t == t) {
@@ -1487,8 +1566,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
* its own timer the hrtimer_cancel() will deadlock
* since it waits for callback_fn to finish.
*/
- ret = -EDEADLK;
- goto out;
+ return -EDEADLK;
}
/* Only account in-flight cancellations when invoked from a timer
@@ -1511,20 +1589,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
* cancelling and waiting for it synchronously, since it might
* do the same. Bail!
*/
- ret = -EDEADLK;
- goto out;
+ atomic_dec(&t->cancelling);
+ return -EDEADLK;
}
drop:
- drop_prog_refcnt(&t->cb);
-out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ bpf_async_update_prog_callback(&t->cb, NULL, NULL);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
- ret = ret ?: hrtimer_cancel(&t->timer);
+ ret = hrtimer_cancel(&t->timer);
if (inc)
atomic_dec(&t->cancelling);
- rcu_read_unlock();
return ret;
}
@@ -1535,107 +1610,107 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
+static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
+ u64 timer_nsec, u32 timer_mode)
{
- struct bpf_async_cb *cb;
+ switch (cb->type) {
+ case BPF_ASYNC_TYPE_TIMER: {
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
- /* Performance optimization: read async->cb without lock first. */
- if (!READ_ONCE(async->cb))
- return NULL;
+ switch (op) {
+ case BPF_ASYNC_START:
+ hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
+ break;
+ case BPF_ASYNC_CANCEL:
+ hrtimer_try_to_cancel(&t->timer);
+ break;
+ }
+ break;
+ }
+ case BPF_ASYNC_TYPE_WQ: {
+ struct bpf_work *w = container_of(cb, struct bpf_work, cb);
+
+ switch (op) {
+ case BPF_ASYNC_START:
+ schedule_work(&w->work);
+ break;
+ case BPF_ASYNC_CANCEL:
+ cancel_work(&w->work);
+ break;
+ }
+ break;
+ }
+ }
+ bpf_async_refcount_put(cb);
+}
- __bpf_spin_lock_irqsave(&async->lock);
- /* re-read it under lock */
- cb = async->cb;
- if (!cb)
- goto out;
- drop_prog_refcnt(cb);
- /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
- * this timer, since it won't be initialized.
- */
- WRITE_ONCE(async->cb, NULL);
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return cb;
+static void bpf_async_irq_worker(struct irq_work *work)
+{
+ struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
+ struct llist_node *pos, *n, *list;
+
+ list = llist_del_all(&cb->async_cmds);
+ if (!list)
+ return;
+
+ list = llist_reverse_order(list);
+ this_cpu_write(async_cb_running, cb);
+ llist_for_each_safe(pos, n, list) {
+ struct bpf_async_cmd *cmd;
+
+ cmd = container_of(pos, struct bpf_async_cmd, node);
+ bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode);
+ kfree_nolock(cmd);
+ }
+ this_cpu_write(async_cb_running, NULL);
}
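bpf_async_schedule_op() and bpf_async_irq_worker() above form a lock-free producer/consumer pair: producers push commands onto an llist and kick the irq_work only when the list was empty, and the worker grabs the whole list in one go, reverses it back to FIFO order, and processes each entry. A self-contained userspace sketch of the same shape (single-threaded driver, illustrative names):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct cmd {
	struct cmd *next;
	int op;
};

static _Atomic(struct cmd *) pending;	/* plays the role of cb->async_cmds */

/* Producer side: returns true when the caller must kick the worker. */
static bool push_cmd(struct cmd *c)
{
	struct cmd *head = atomic_load(&pending);

	do {
		c->next = head;
	} while (!atomic_compare_exchange_weak(&pending, &head, c));

	return head == NULL;
}

/* Consumer side: take everything at once, restore FIFO order, process. */
static void drain_cmds(void)
{
	struct cmd *list = atomic_exchange(&pending, NULL);
	struct cmd *fifo = NULL;

	while (list) {				/* reverse the LIFO push order */
		struct cmd *next = list->next;

		list->next = fifo;
		fifo = list;
		list = next;
	}
	while (fifo) {
		struct cmd *next = fifo->next;

		printf("op %d\n", fifo->op);	/* stand-in for the real handler */
		free(fifo);
		fifo = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct cmd *c = calloc(1, sizeof(*c));

		if (!c)
			return 1;
		c->op = i;
		/* The kernel code queues the irq_work when this returns true. */
		(void)push_cmd(c);
	}
	drain_cmds();		/* prints op 0, 1, 2 in submission order */
	return 0;
}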
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+ if (!READ_ONCE(async->cb))
+ return;
- if (!t)
+ cb = xchg(&async->cb, NULL);
+ if (!cb)
return;
- /* We check that bpf_map_delete/update_elem() was called from timer
- * callback_fn. In such case we don't call hrtimer_cancel() (since it
- * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
- * just return -1). Though callback_fn is still running on this cpu it's
- * safe to do kfree(t) because bpf_timer_cb() read everything it needed
- * from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since async->cb = NULL was already done. The timer will be
- * effectively cancelled because bpf_timer_cb() will return
- * HRTIMER_NORESTART.
- *
- * However, it is possible the timer callback_fn calling us armed the
- * timer _before_ calling us, such that failing to cancel it here will
- * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
- * Therefore, we _need_ to cancel any outstanding timers before we do
- * call_rcu, even though no more timers can be armed.
- *
- * Moreover, we need to schedule work even if timer does not belong to
- * the calling callback_fn, as on two different CPUs, we can end up in a
- * situation where both sides run in parallel, try to cancel one
- * another, and we end up waiting on both sides in hrtimer_cancel
- * without making forward progress, since timer1 depends on time2
- * callback to finish, and vice versa.
- *
- * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
- * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
- *
- * To avoid these issues, punt to workqueue context when we are in a
- * timer callback.
+
+ bpf_async_update_prog_callback(cb, NULL, NULL);
+ /*
+ * No refcount_inc_not_zero(&cb->refcnt) here. We are dropping the last
+ * refcnt, either synchronously or asynchronously via irq_work.
*/
- if (this_cpu_read(hrtimer_running)) {
- queue_work(system_dfl_wq, &t->cb.delete_work);
- return;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- /* If the timer is running on other CPU, also use a kworker to
- * wait for the completion of the timer instead of trying to
- * acquire a sleepable lock in hrtimer_cancel() to wait for its
- * completion.
- */
- if (hrtimer_try_to_cancel(&t->timer) >= 0)
- call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
- else
- queue_work(system_dfl_wq, &t->cb.delete_work);
+ if (!defer_timer_wq_op()) {
+ bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0);
} else {
- bpf_timer_delete_work(&t->cb.delete_work);
+ (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
+ /*
+ * bpf_async_schedule_op() either enqueues the allocated cmd into the
+ * llist or fails with ENOMEM and drops the last refcnt.
+ * The latter is unlikely, but safe, since the bpf_async_cb_rcu_tasks_trace_free()
+ * callback will do an additional timer/wq cancel anyway to handle races.
+ */
}
}
-/* This function is called by map_delete/update_elem for individual element and
+/*
+ * This function is called by map_delete/update_elem for individual element and
* by ops->map_release_uref when the user space reference to a map reaches zero.
*/
-void bpf_wq_cancel_and_free(void *val)
+void bpf_timer_cancel_and_free(void *val)
{
- struct bpf_work *work;
-
- BTF_TYPE_EMIT(struct bpf_wq);
+ bpf_async_cancel_and_free(val);
+}
- work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
- if (!work)
- return;
- /* Trigger cancel of the sleepable work, but *do not* wait for
- * it to finish if it was running as we might not be in a
- * sleepable context.
- * kfree will be called once the work has finished.
- */
- schedule_work(&work->delete_work);
+/*
+ * This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ bpf_async_cancel_and_free(val);
}
BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -2092,12 +2167,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_cgroup_classid_curr_proto;
#endif
case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
return &bpf_task_storage_delete_proto;
default:
break;
@@ -2709,14 +2780,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
* length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
- * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ * If buffer__nullable is NULL, the call will fail if the buffer was needed.
*
* If the intention is to write to the data slice, please use
* bpf_dynptr_slice_rdwr.
@@ -2734,7 +2805,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
- void *buffer__opt, u64 buffer__szk)
+ void *buffer__nullable, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
@@ -2755,8 +2826,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
case BPF_DYNPTR_TYPE_RINGBUF:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
- if (buffer__opt)
- return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ if (buffer__nullable)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable);
else
return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
case BPF_DYNPTR_TYPE_XDP:
@@ -2765,16 +2836,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
if (!IS_ERR_OR_NULL(xdp_ptr))
return xdp_ptr;
- if (!buffer__opt)
+ if (!buffer__nullable)
return NULL;
- bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
- return buffer__opt;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false);
+ return buffer__nullable;
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
case BPF_DYNPTR_TYPE_FILE:
- err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
- return err ? NULL : buffer__opt;
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk);
+ return err ? NULL : buffer__nullable;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2785,14 +2856,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
* length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
- * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ * If buffer__nullable is NULL, the call will fail if the buffer was needed.
*
* The returned pointer is writable and may point to either directly the dynptr
* data at the requested offset or to the buffer if unable to obtain a direct
@@ -2824,7 +2895,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
- void *buffer__opt, u64 buffer__szk)
+ void *buffer__nullable, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2853,7 +2924,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
}
__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
@@ -3108,30 +3179,36 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
struct bpf_work *w;
- if (in_nmi())
- return -EOPNOTSUPP;
if (flags)
return -EINVAL;
+
w = READ_ONCE(async->work);
if (!w || !READ_ONCE(w->cb.prog))
return -EINVAL;
- schedule_work(&w->work);
- return 0;
+ if (!refcount_inc_not_zero(&w->cb.refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ schedule_work(&w->work);
+ bpf_async_refcount_put(&w->cb);
+ return 0;
+ } else {
+ return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
+ }
}
-__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
- int (callback_fn)(void *map, int *key, void *value),
- unsigned int flags,
- void *aux__prog)
+__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ struct bpf_prog_aux *aux)
{
- struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
if (flags)
return -EINVAL;
- return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+ return __bpf_async_set_callback(async, callback_fn, aux->prog);
}
__bpf_kfunc void bpf_preempt_disable(void)
@@ -3406,7 +3483,7 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
-static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
+static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len)
{
char c1, c2;
int i;
@@ -3417,7 +3494,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
}
guard(pagefault)();
- for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) {
__get_kernel_nofault(&c1, s1, char, err_out);
__get_kernel_nofault(&c2, s2, char, err_out);
if (ignore_case) {
@@ -3431,7 +3508,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
s1++;
s2++;
}
- return -E2BIG;
+ return i == XATTR_SIZE_MAX ? -E2BIG : 0;
err_out:
return -EFAULT;
}
@@ -3451,7 +3528,7 @@ err_out:
*/
__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
{
- return __bpf_strcasecmp(s1__ign, s2__ign, false);
+ return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX);
}
/**
@@ -3469,7 +3546,26 @@ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
*/
__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
{
- return __bpf_strcasecmp(s1__ign, s2__ign, true);
+ return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX);
+}
+
+/**
+ * bpf_strncasecmp - Compare two length-limited strings, ignoring case
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ * @len: The maximum number of characters to compare
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of the kernel address space
+ */
+__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len)
+{
+ return __bpf_strncasecmp(s1__ign, s2__ign, true, len);
}
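The semantics of the new length-limited variant are worth spelling out: exhausting the caller-supplied budget len means the strings compared equal within that window (return 0), while hitting the hard XATTR_SIZE_MAX cap is still reported as -E2BIG. A userspace sketch of the same logic without the kernel's pagefault guard (the cap constant is a stand-in):

#include <ctype.h>
#include <errno.h>
#include <stddef.h>

#define SKETCH_CAP 65536	/* stands in for XATTR_SIZE_MAX */

static int sketch_strncasecmp(const char *s1, const char *s2, size_t len)
{
	size_t i;

	for (i = 0; i < len && i < SKETCH_CAP; i++) {
		int c1 = tolower((unsigned char)s1[i]);
		int c2 = tolower((unsigned char)s2[i]);

		if (c1 != c2)
			return c1 < c2 ? -1 : 1;
		if (!c1)
			return 0;	/* both strings ended together */
	}
	return i == SKETCH_CAP ? -E2BIG : 0;	/* cap hit vs. len exhausted */
}

int main(void)
{
	/* Equal within the first 5 characters, so this returns 0. */
	return sketch_strncasecmp("TimerX", "timerY", 5);
}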
/**
@@ -4275,41 +4371,39 @@ release_prog:
}
/**
- * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
* mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
* @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
- struct bpf_task_work *tw, void *map__map,
- bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ struct bpf_prog_aux *aux)
{
- return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
}
/**
- * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
* mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
* @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
- struct bpf_task_work *tw, void *map__map,
- bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ struct bpf_prog_aux *aux)
{
- return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
}
static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
@@ -4360,6 +4454,53 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
return 0;
}
+/**
+ * bpf_timer_cancel_async - try to deactivate a timer
+ * @timer: bpf_timer to stop
+ *
+ * Return:
+ *
+ * * 0 when the timer was not active
+ * * 1 when the timer was active
+ * * -1 when the timer is currently executing the callback function and
+ * cannot be stopped
+ * * -ECANCELED when the timer will be cancelled asynchronously
+ * * -ENOMEM when out of memory
+ * * -EINVAL when the timer was not initialized
+ * * -ENOENT when this kfunc is racing with timer deletion
+ */
+__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
+{
+ struct bpf_async_kern *async = (void *)timer;
+ struct bpf_async_cb *cb;
+ int ret;
+
+ cb = READ_ONCE(async->cb);
+ if (!cb)
+ return -EINVAL;
+
+ /*
+ * Unlike hrtimer_start(), it's ok to call hrtimer_try_to_cancel()
+ * synchronously when refcnt has reached zero, but deferring to
+ * irq_work is not, since the irq_work callback may execute after the
+ * RCU GP and cb could be freed by then. Check for refcnt zero for
+ * consistency.
+ */
+ if (!refcount_inc_not_zero(&cb->refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
+
+ ret = hrtimer_try_to_cancel(&t->timer);
+ bpf_async_refcount_put(cb);
+ return ret;
+ } else {
+ ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
+ return ret ? ret : -ECANCELED;
+ }
+}
+
__bpf_kfunc_end_defs();
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
@@ -4427,7 +4568,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
-BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_send_signal_task)
#endif
#ifdef CONFIG_KEYS
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
@@ -4467,14 +4608,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
#ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
#endif
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_dynptr_adjust)
@@ -4488,7 +4629,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset)
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
#endif
BTF_ID_FLAGS(func, bpf_wq_init)
-BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_wq_start)
BTF_ID_FLAGS(func, bpf_preempt_disable)
BTF_ID_FLAGS(func, bpf_preempt_enable)
@@ -4510,8 +4651,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
@@ -4521,6 +4662,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
BTF_ID_FLAGS(func, bpf_strcasecmp);
+BTF_ID_FLAGS(func, bpf_strncasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -4536,11 +4678,13 @@ BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file)
BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_ID_FLAGS(func, bpf_timer_cancel_async)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9f866a010dad..005ea3a2cda7 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -600,10 +600,17 @@ struct bpffs_btf_enums {
static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
{
+ struct {
+ const struct btf_type **type;
+ const char *name;
+ } btf_enums[] = {
+ {&info->cmd_t, "bpf_cmd"},
+ {&info->map_t, "bpf_map_type"},
+ {&info->prog_t, "bpf_prog_type"},
+ {&info->attach_t, "bpf_attach_type"},
+ };
const struct btf *btf;
- const struct btf_type *t;
- const char *name;
- int i, n;
+ int i, id;
memset(info, 0, sizeof(*info));
@@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
info->btf = btf;
- for (i = 1, n = btf_nr_types(btf); i < n; i++) {
- t = btf_type_by_id(btf, i);
- if (!btf_type_is_enum(t))
- continue;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name)
- continue;
-
- if (strcmp(name, "bpf_cmd") == 0)
- info->cmd_t = t;
- else if (strcmp(name, "bpf_map_type") == 0)
- info->map_t = t;
- else if (strcmp(name, "bpf_prog_type") == 0)
- info->prog_t = t;
- else if (strcmp(name, "bpf_attach_type") == 0)
- info->attach_t = t;
- else
- continue;
+ for (i = 0; i < ARRAY_SIZE(btf_enums); i++) {
+ id = btf_find_by_name_kind(btf, btf_enums[i].name,
+ BTF_KIND_ENUM);
+ if (id < 0)
+ return -ESRCH;
- if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
- return 0;
+ *btf_enums[i].type = btf_type_by_id(btf, id);
}
- return -ESRCH;
+ return 0;
}
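The rewrite above swaps an open-coded scan over all BTF types for a small table of {destination, name} pairs resolved in one loop. The pattern generalizes; a userspace sketch with a stand-in lookup function and illustrative names:

#include <stddef.h>
#include <string.h>

struct ty { const char *name; };

static const struct ty known[] = { { "bpf_cmd" }, { "bpf_map_type" } };

/* Stand-in for btf_find_by_name_kind() + btf_type_by_id(). */
static const struct ty *find_by_name(const char *name)
{
	for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++)
		if (!strcmp(known[i].name, name))
			return &known[i];
	return NULL;
}

static int resolve_all(const struct ty **cmd_t, const struct ty **map_t)
{
	struct {
		const struct ty **slot;
		const char *name;
	} table[] = {
		{ cmd_t, "bpf_cmd" },
		{ map_t, "bpf_map_type" },
	};

	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct ty *t = find_by_name(table[i].name);

		if (!t)
			return -1;	/* -ESRCH in the kernel code */
		*table[i].slot = t;
	}
	return 0;
}

int main(void)
{
	const struct ty *cmd_t, *map_t;

	return resolve_all(&cmd_t, &map_t);
}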
static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c93a756e035c..1ccbf28b2ad9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
- void *value)
+ void *value, u64 map_flags)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage *storage;
@@ -198,12 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu));
+ goto unlock;
+ }
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(storage->percpu_buf, cpu), size);
+ copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu));
off += size;
}
+unlock:
rcu_read_unlock();
return 0;
}
@@ -213,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage *storage;
- int cpu, off = 0;
+ void *val;
u32 size;
+ int cpu;
- if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+ if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS))
return -EINVAL;
rcu_read_lock();
@@ -232,12 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value);
+ goto unlock;
+ }
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
- value + off, size);
- off += size;
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val);
}
+unlock:
rcu_read_unlock();
return 0;
}
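Both the copy and update paths above assume the same encoding: when BPF_F_CPU is set, the target CPU index rides in the upper 32 bits of the u64 flags word (map_flags >> 32). A tiny userspace sketch of that packing, with a placeholder flag bit since the real BPF_F_CPU value is not shown here:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_F_CPU (1ULL << 8)	/* placeholder, not the real BPF_F_CPU */

static uint64_t make_flags(uint32_t cpu)
{
	return SKETCH_F_CPU | ((uint64_t)cpu << 32);
}

int main(void)
{
	uint64_t flags = make_flags(3);

	if (flags & SKETCH_F_CPU)
		printf("target cpu %u\n", (uint32_t)(flags >> 32));
	return 0;
}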
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 9575314f40a6..261a03ea73d3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count)
BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 42ae8d595c2c..227f9b5f388b 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,16 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017-2018 Netronome Systems, Inc.
- *
- * This software is licensed under the GNU General License Version 2,
- * June 1991 as shown in the file COPYING in the top-level directory of this
- * source tree.
- *
- * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
- * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
- * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
- * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
- * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
*/
#include <linux/bpf.h>
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 99c63d982c5d..2f28886f3ff7 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
range_it_insert(rn, rt);
/* Add a range */
- new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT,
+ NUMA_NO_NODE);
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
@@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
right->rn_start = start;
range_it_insert(right, rt);
} else {
- left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE);
if (!left)
return -ENOMEM;
left->rn_start = start;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f6a075ffac63..35ae64ade36b 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index f7d0c8d4644e..e4e338cdb437 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
RES_INIT_TIMEOUT(ts);
/*
- * The fast path is not invoked for the TAS fallback, so we must grab
- * the deadlock detection entry here.
+ * We are either called directly from res_spin_lock after grabbing the
+ * deadlock detection entry when queued spinlocks are disabled, or from
+ * resilient_queued_spin_lock_slowpath after grabbing the deadlock
+ * detection entry. No need to obtain it here.
*/
- grab_held_lock_entry(lock);
/*
* Since the waiting loop's time is dependent on the amount of
@@ -694,7 +695,6 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
int ret;
BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
- BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
preempt_disable();
ret = res_spin_lock((rqspinlock_t *)lock);
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index 0b6bc3f30335..be9ce98e9469 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -212,14 +212,13 @@ __bpf_kfunc_start_defs();
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
* enum in headers.
*/
-__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
- u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, struct bpf_prog_aux *aux)
{
struct bpf_bprintf_data data = {
.get_bin_args = true,
.get_buf = true,
};
- struct bpf_prog_aux *aux = aux__prog;
u32 fmt_size = strlen(fmt__str) + 1;
struct bpf_stream *stream;
u32 data_len = len__sz;
@@ -246,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, con
return ret;
}
+/* Directly trigger a stack dump from the program. */
+__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ /* Make sure the stream ID is valid. */
+ if (!bpf_stream_get(stream_id, aux))
+ return -ENOENT;
+
+ prog = aux->main_prog_aux->prog;
+
+ bpf_stream_stage(ss, prog, stream_id, ({
+ bpf_stream_dump_stack(ss);
+ }));
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
/* Added kfunc to common_btf_ids */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ff82144f885..dd89bf809772 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -9,6 +9,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
+#include <linux/hex.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
@@ -133,12 +134,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
return atomic64_read(&map->writecnt) != 0;
}
-static u32 bpf_map_value_size(const struct bpf_map *map)
+static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
{
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
+ return map->value_size;
+ else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
return round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map))
return sizeof(u32);
@@ -314,11 +317,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
+ err = bpf_percpu_hash_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
+ err = bpf_percpu_array_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ err = bpf_percpu_cgroup_storage_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
@@ -505,17 +508,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
return root_mem_cgroup;
}
+void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+ struct mem_cgroup **new_memcg)
+{
+ *new_memcg = bpf_map_get_memcg(map);
+ *old_memcg = set_active_memcg(*new_memcg);
+}
+
+void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+ struct mem_cgroup *new_memcg)
+{
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(new_memcg);
+}
+
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -526,11 +541,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -540,11 +553,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -555,11 +566,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -570,11 +579,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -612,12 +619,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long i, j;
struct page *pg;
int ret = 0;
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg, *old_memcg;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
-#endif
for (i = 0; i < nr_pages; i++) {
pg = __bpf_alloc_page(nid);
@@ -631,10 +633,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
break;
}
-#ifdef CONFIG_MEMCG
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
-#endif
return ret;
}
@@ -1366,11 +1364,6 @@ free_map_tab:
return ret;
}
-static bool bpf_net_capable(void)
-{
- return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
-}
-
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
static int map_create(union bpf_attr *attr, bpfptr_t uattr)
@@ -1734,7 +1727,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
- err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
@@ -1742,7 +1735,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(key))
return PTR_ERR(key);
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1809,7 +1802,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
value = kvmemdup_bpfptr(uvalue, value_size);
if (IS_ERR(value)) {
err = PTR_ERR(value);
@@ -2005,11 +1998,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
+ BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2064,11 +2058,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2190,7 +2184,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, 0);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -2820,6 +2814,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
void *sig;
int err = 0;
+ /*
+ * Don't attempt to use kmalloc_large or vmalloc for signatures.
+ * Any practical signature for a BPF program should be below this limit.
+ */
+ if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE)
+ return -EINVAL;
+
if (system_keyring_id_check(attr->keyring_id) == 0)
key = bpf_lookup_system_key(attr->keyring_id);
else
@@ -3579,6 +3580,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
case BPF_PROG_TYPE_TRACING:
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->expected_attach_type != BPF_TRACE_FSESSION &&
prog->expected_attach_type != BPF_MODIFY_RETURN) {
err = -EINVAL;
goto out_put_prog;
@@ -3628,7 +3630,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
- link = kzalloc(sizeof(*link), GFP_USER);
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ struct bpf_fsession_link *fslink;
+
+ fslink = kzalloc(sizeof(*fslink), GFP_USER);
+ if (fslink) {
+ bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type);
+ fslink->fexit.cookie = bpf_cookie;
+ link = &fslink->link;
+ } else {
+ link = NULL;
+ }
+ } else {
+ link = kzalloc(sizeof(*link), GFP_USER);
+ }
if (!link) {
err = -ENOMEM;
goto out_put_prog;
@@ -4352,6 +4368,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_RAW_TP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
@@ -4565,6 +4582,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);
+ } else if (!bpf_mprog_detach_empty(ptype)) {
+ return -EPERM;
}
} else if (is_cgroup_prog_type(ptype, 0, false)) {
if (attr->attach_flags || attr->relative_fd)
@@ -5310,6 +5329,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
if (info.hash_size != SHA256_DIGEST_SIZE)
return -EINVAL;
+ if (!READ_ONCE(map->frozen))
+ return -EPERM;
+
err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
if (err != 0)
return err;
@@ -6122,6 +6144,49 @@ static int prog_stream_read(union bpf_attr *attr)
return ret;
}
+#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd
+
+static int prog_assoc_struct_ops(union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS))
+ return -EINVAL;
+
+ if (attr->prog_assoc_struct_ops.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto put_prog;
+ }
+
+ map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto put_prog;
+ }
+
+ if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto put_map;
+ }
+
+ ret = bpf_prog_assoc_struct_ops(prog, map);
+
+put_map:
+ bpf_map_put(map);
+put_prog:
+ bpf_prog_put(prog);
+ return ret;
+}
+
static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
@@ -6261,6 +6326,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_STREAM_READ_BY_FD:
err = prog_stream_read(&attr);
break;
+ case BPF_PROG_ASSOC_STRUCT_OPS:
+ err = prog_assoc_struct_ops(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -6407,7 +6475,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index f8e70e9c3998..26fbfbb01700 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -8,6 +8,7 @@
*/
#include <linux/kernel.h>
#include <linux/tnum.h>
+#include <linux/swab.h>
#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
/* A completely unknown value */
@@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
return tnum_with_subreg(a, tnum_const(value));
}
+
+struct tnum tnum_bswap16(struct tnum a)
+{
+ return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF));
+}
+
+struct tnum tnum_bswap32(struct tnum a)
+{
+ return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF));
+}
+
+struct tnum tnum_bswap64(struct tnum a)
+{
+ return TNUM(swab64(a.value), swab64(a.mask));
+}
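The three byte-swap helpers rely on bswap being a pure permutation of bit positions, so swapping value and mask independently maps every concrete member of a tnum onto a member of the swapped tnum. A small userspace check of that property for one example 32-bit tnum, using compiler builtins instead of the kernel tnum API:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Example tnum: value 0x00ab0000, unknown bits in mask 0x0000ff00. */
	uint32_t value = 0x00ab0000, mask = 0x0000ff00;

	for (uint32_t b = 0; b < 256; b++) {
		uint32_t x = value | (b << 8);		/* enumerate all members */
		uint32_t sx = __builtin_bswap32(x);

		/* Membership in the swapped tnum: unknown bits cleared must equal value. */
		assert((sx & ~__builtin_bswap32(mask)) == __builtin_bswap32(value));
	}
	return 0;
}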
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index feecd8f4dbf9..7e4aa1e44b50 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 976d89011b15..952cd7932461 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -24,19 +24,49 @@ const struct bpf_prog_ops bpf_extension_prog_ops = {
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
-static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
-/* serializes access to trampoline_table */
+/* serializes access to trampoline tables */
static DEFINE_MUTEX(trampoline_mutex);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
-static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
{
- struct bpf_trampoline *tr = ops->private;
+ struct hlist_head *head_ip;
+ struct bpf_trampoline *tr;
+
+ mutex_lock(&trampoline_mutex);
+ head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head_ip, hlist_ip) {
+ if (tr->ip == ip)
+ goto out;
+ }
+ tr = NULL;
+out:
+ mutex_unlock(&trampoline_mutex);
+ return tr;
+}
+#else
+static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
+{
+ return ops->private;
+}
+#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
+ enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr;
int ret = 0;
+ tr = direct_ops_ip_lookup(ops, ip);
+ if (!tr)
+ return -EINVAL;
+
if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
/* This is called inside register_ftrace_direct_multi(), so
* tr->mutex is already locked.
@@ -109,10 +139,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
enum bpf_attach_type eatype = prog->expected_attach_type;
enum bpf_prog_type ptype = prog->type;
- return (ptype == BPF_PROG_TYPE_TRACING &&
- (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN)) ||
- (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+ switch (ptype) {
+ case BPF_PROG_TYPE_TRACING:
+ if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+ return true;
+ return false;
+ case BPF_PROG_TYPE_LSM:
+ return eatype == BPF_LSM_MAC;
+ default:
+ return false;
+ }
}
void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
@@ -135,15 +172,171 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+/*
+ * We have only a single direct_ops, which contains all the direct call
+ * sites and is the one global ftrace_ops shared by all trampolines.
+ *
+ * We use the 'update_ftrace_direct_*' API for attachment.
+ */
+struct ftrace_ops direct_ops = {
+ .ops_func = bpf_tramp_ftrace_ops_func,
+};
+
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ tr->fops = &direct_ops;
+ return 0;
+}
+
+static void direct_ops_free(struct bpf_trampoline *tr) { }
+
+static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr)
+{
+ unsigned long ip, addr = (unsigned long) ptr;
+ struct ftrace_hash *hash;
+
+ ip = ftrace_location(tr->ip);
+ if (!ip)
+ return NULL;
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ if (!hash)
+ return NULL;
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+ if (!add_ftrace_hash_entry_direct(hash, ip, addr)) {
+ free_ftrace_hash(hash);
+ return NULL;
+ }
+ return hash;
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_add(tr->fops, hash);
+ free_ftrace_hash(hash);
+ return err;
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_del(tr->fops, hash);
+ free_ftrace_hash(hash);
+ return err;
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex);
+ free_ftrace_hash(hash);
+ return err;
+}
+#else
+/*
+ * We allocate a separate ftrace_ops object for each trampoline, and
+ * it contains the call site specific to that trampoline.
+ *
+ * We use the *_ftrace_direct API for attachment.
+ */
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops)
+ return -ENOMEM;
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+ return 0;
+}
+
+static void direct_ops_free(struct bpf_trampoline *tr)
+{
+ if (!tr->fops)
+ return;
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *ptr)
+{
+ unsigned long addr = (unsigned long) ptr;
+ struct ftrace_ops *ops = tr->fops;
+ int ret;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+
+ ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1);
+ if (ret)
+ return ret;
+ return register_ftrace_direct(ops, addr);
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ return unregister_ftrace_direct(tr->fops, (long)addr, false);
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
+{
+ unsigned long addr = (unsigned long) ptr;
+ struct ftrace_ops *ops = tr->fops;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+ if (lock_direct_mutex)
+ return modify_ftrace_direct(ops, addr);
+ return modify_ftrace_direct_nolock(ops, addr);
+}
+#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
+#else
+static void direct_ops_free(struct bpf_trampoline *tr) { }
+
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ return 0;
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
+{
+ return -ENODEV;
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ return -ENODEV;
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
int i;
mutex_lock(&trampoline_mutex);
- head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
- hlist_for_each_entry(tr, head, hlist) {
+ head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head, hlist_key) {
if (tr->key == key) {
refcount_inc(&tr->refcnt);
goto out;
@@ -152,20 +345,19 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
- if (!tr->fops) {
+ if (direct_ops_alloc(tr)) {
kfree(tr);
tr = NULL;
goto out;
}
- tr->fops->private = tr;
- tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
-#endif
tr->key = key;
- INIT_HLIST_NODE(&tr->hlist);
- hlist_add_head(&tr->hlist, head);
+ tr->ip = ftrace_location(ip);
+ INIT_HLIST_NODE(&tr->hlist_key);
+ INIT_HLIST_NODE(&tr->hlist_ip);
+ hlist_add_head(&tr->hlist_key, head);
+ head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)];
+ hlist_add_head(&tr->hlist_ip, head);
refcount_set(&tr->refcnt, 1);
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
@@ -200,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
+ ret = direct_ops_del(tr, old_addr);
else
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
@@ -214,10 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
int ret;
if (tr->func.ftrace_managed) {
- if (lock_direct_mutex)
- ret = modify_ftrace_direct(tr->fops, (long)new_addr);
- else
- ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ ret = direct_ops_mod(tr, new_addr, lock_direct_mutex);
} else {
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
new_addr);
@@ -240,10 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
- ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- if (ret)
- return ret;
- ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ ret = direct_ops_add(tr, new_addr);
} else {
ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
@@ -499,13 +685,6 @@ again:
if (err)
goto out_free;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
- if (bpf_trampoline_use_jmp(tr->flags))
- tr->fops->flags |= FTRACE_OPS_FL_JMP;
- else
- tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
-#endif
-
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
@@ -533,15 +712,8 @@ again:
tr->cur_image = im;
out:
/* If any error happens, restore previous flags */
- if (err) {
+ if (err)
tr->flags = orig_flags;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
- if (bpf_trampoline_use_jmp(tr->flags))
- tr->fops->flags |= FTRACE_OPS_FL_JMP;
- else
- tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
-#endif
- }
kfree(tlinks);
return err;
@@ -559,6 +731,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
return BPF_TRAMP_MODIFY_RETURN;
case BPF_TRACE_FEXIT:
return BPF_TRAMP_FEXIT;
+ case BPF_TRACE_FSESSION:
+ return BPF_TRAMP_FSESSION;
case BPF_LSM_MAC:
if (!prog->aux->attach_func_proto->type)
/* The function returns void, we cannot modify its
@@ -594,8 +768,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
+ struct bpf_fsession_link *fslink = NULL;
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
+ struct hlist_head *prog_list;
int err = 0;
int cnt = 0, i;
@@ -621,24 +797,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
}
+ if (kind == BPF_TRAMP_FSESSION) {
+ prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
+ cnt++;
+ } else {
+ prog_list = &tr->progs_hlist[kind];
+ }
if (cnt >= BPF_MAX_TRAMP_LINKS)
return -E2BIG;
if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
return -EBUSY;
- hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
if (link_exiting->link.prog != link->link.prog)
continue;
/* prog already linked */
return -EBUSY;
}
- hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
- tr->progs_cnt[kind]++;
+ hlist_add_head(&link->tramp_hlist, prog_list);
+ if (kind == BPF_TRAMP_FSESSION) {
+ tr->progs_cnt[BPF_TRAMP_FENTRY]++;
+ fslink = container_of(link, struct bpf_fsession_link, link.link);
+ hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]++;
+ } else {
+ tr->progs_cnt[kind]++;
+ }
err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
hlist_del_init(&link->tramp_hlist);
- tr->progs_cnt[kind]--;
+ if (kind == BPF_TRAMP_FSESSION) {
+ tr->progs_cnt[BPF_TRAMP_FENTRY]--;
+ hlist_del_init(&fslink->fexit.tramp_hlist);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+ } else {
+ tr->progs_cnt[kind]--;
+ }
}
return err;
}
@@ -672,6 +867,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
guard(mutex)(&tgt_prog->aux->ext_mutex);
tgt_prog->aux->is_extended = false;
return err;
+ } else if (kind == BPF_TRAMP_FSESSION) {
+ struct bpf_fsession_link *fslink =
+ container_of(link, struct bpf_fsession_link, link.link);
+
+ hlist_del_init(&fslink->fexit.tramp_hlist);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+ kind = BPF_TRAMP_FENTRY;
}
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
@@ -850,7 +1052,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
prog->aux->attach_btf_id);
bpf_lsm_find_cgroup_shim(prog, &bpf_func);
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, 0);
if (WARN_ON_ONCE(!tr))
return;
@@ -870,7 +1072,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
{
struct bpf_trampoline *tr;
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr);
if (!tr)
return NULL;
@@ -906,11 +1108,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* fexit progs. The fentry-only trampoline will be freed via
* multiple rcu callbacks.
*/
- hlist_del(&tr->hlist);
- if (tr->fops) {
- ftrace_free_filter(tr->fops);
- kfree(tr->fops);
- }
+ hlist_del(&tr->hlist_key);
+ hlist_del(&tr->hlist_ip);
+ direct_ops_free(tr);
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
@@ -949,7 +1149,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -993,7 +1193,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
rcu_read_unlock_migrate();
}
@@ -1029,7 +1229,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -1044,7 +1244,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
migrate_enable();
rcu_read_unlock_trace();
}
@@ -1179,7 +1379,9 @@ static int __init init_trampolines(void)
int i;
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&trampoline_table[i]);
+ INIT_HLIST_HEAD(&trampoline_key_table[i]);
+ for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&trampoline_ip_table[i]);
return 0;
}
late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..edf5342b982f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -272,8 +272,13 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}
+struct bpf_map_desc {
+ struct bpf_map *ptr;
+ int uid;
+};
+
struct bpf_call_arg_meta {
- struct bpf_map *map_ptr;
+ struct bpf_map_desc map;
bool raw_mode;
bool pkt_access;
u8 release_regno;
@@ -283,7 +288,6 @@ struct bpf_call_arg_meta {
u64 msize_max_value;
int ref_obj_id;
int dynptr_id;
- int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -294,6 +298,14 @@ struct bpf_call_arg_meta {
s64 const_map_key;
};
+struct bpf_kfunc_meta {
+ struct btf *btf;
+ const struct btf_type *proto;
+ const char *name;
+ const u32 *flags;
+ s32 id;
+};
+
struct bpf_kfunc_call_arg_meta {
/* In parameters */
struct btf *btf;
@@ -343,10 +355,7 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
- struct {
- struct bpf_map *ptr;
- int uid;
- } map;
+ struct bpf_map_desc map;
u64 mem_size;
};
@@ -512,7 +521,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id);
static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id);
static bool is_task_work_add_kfunc(u32 func_id);
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
@@ -554,7 +563,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn
/* bpf_wq and bpf_task_work callbacks are always sleepable. */
if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
- (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
return true;
verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
@@ -2341,6 +2350,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
reg->u32_max_value = U32_MAX;
}
+static void reset_reg64_and_tnum(struct bpf_reg_state *reg)
+{
+ __mark_reg64_unbounded(reg);
+ reg->var_off = tnum_unknown;
+}
+
+static void reset_reg32_and_tnum(struct bpf_reg_state *reg)
+{
+ __mark_reg32_unbounded(reg);
+ reg->var_off = tnum_unknown;
+}
+
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
struct tnum var32_off = tnum_subreg(reg->var_off);
@@ -3263,16 +3284,105 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+#define KF_IMPL_SUFFIX "_impl"
+
+static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env,
+ struct btf *btf,
+ const char *func_name)
+{
+ char *buf = env->tmp_str_buf;
+ const struct btf_type *func;
+ s32 impl_id;
+ int len;
+
+ len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX);
+ if (len < 0 || len >= TMP_STR_BUF_LEN) {
+ verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX);
+ return NULL;
+ }
+
+ impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC);
+ if (impl_id <= 0) {
+ verbose(env, "cannot find function %s in BTF\n", buf);
+ return NULL;
+ }
+
+ func = btf_type_by_id(btf, impl_id);
+
+ return btf_type_by_id(btf, func->type);
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_meta *kfunc)
{
const struct btf_type *func, *func_proto;
+ const char *func_name;
+ u32 *kfunc_flags;
+ struct btf *btf;
+
+ if (func_id <= 0) {
+ verbose(env, "invalid kernel function btf_id %d\n", func_id);
+ return -EINVAL;
+ }
+
+ btf = find_kfunc_desc_btf(env, offset);
+ if (IS_ERR(btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(btf);
+ }
+
+ /*
+ * Note that kfunc_flags may be NULL at this point, which
+ * means that we couldn't find func_id in any relevant
+ * kfunc_id_set. This most likely indicates an invalid kfunc
+ * call. However, we don't fail with an error here and instead let
+ * the caller decide what to do with a NULL kfunc->flags.
+ */
+ kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog);
+
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %d is not a function\n", func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(btf, func->name_off);
+
+ /*
+ * The actual prototype of a kfunc with the KF_IMPLICIT_ARGS flag
+ * can be found through its _impl counterpart.
+ */
+ if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS))
+ func_proto = find_kfunc_impl_proto(env, btf, func_name);
+ else
+ func_proto = btf_type_by_id(btf, func->type);
+
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %d does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ memset(kfunc, 0, sizeof(*kfunc));
+ kfunc->btf = btf;
+ kfunc->id = func_id;
+ kfunc->name = func_name;
+ kfunc->proto = func_proto;
+ kfunc->flags = kfunc_flags;
+
+ return 0;
+}
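
As a purely illustrative sketch of the _impl convention handled above (the kfunc names below are invented, and the exact set and position of implicit arguments is an assumption; only the name-mangling rule comes from the code):

/* What a BPF program declares and calls: no implicit argument visible. */
int bpf_example_kf(struct bpf_timer *timer, u64 flags);

/*
 * Counterpart whose prototype the verifier actually checks when the kfunc
 * carries KF_IMPLICIT_ARGS: find_kfunc_impl_proto() appends "_impl" to the
 * name and looks it up in BTF. The trailing bpf_prog_aux pointer here is a
 * hypothetical implicit argument, shown only for illustration.
 */
int bpf_example_kf_impl(struct bpf_timer *timer, u64 flags,
			struct bpf_prog_aux *aux);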
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+{
struct bpf_kfunc_btf_tab *btf_tab;
struct btf_func_model func_model;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_meta kfunc;
struct bpf_kfunc_desc *desc;
- const char *func_name;
- struct btf *desc_btf;
unsigned long addr;
int err;
@@ -3322,12 +3432,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, offset);
- if (IS_ERR(desc_btf)) {
- verbose(env, "failed to find BTF for kernel function\n");
- return PTR_ERR(desc_btf);
- }
-
if (find_kfunc_desc(env->prog, func_id, offset))
return 0;
@@ -3336,24 +3440,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -E2BIG;
}
- func = btf_type_by_id(desc_btf, func_id);
- if (!func || !btf_type_is_func(func)) {
- verbose(env, "kernel btf_id %u is not a function\n",
- func_id);
- return -EINVAL;
- }
- func_proto = btf_type_by_id(desc_btf, func->type);
- if (!func_proto || !btf_type_is_func_proto(func_proto)) {
- verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
- func_id);
- return -EINVAL;
- }
+ err = fetch_kfunc_meta(env, func_id, offset, &kfunc);
+ if (err)
+ return err;
- func_name = btf_name_by_offset(desc_btf, func->name_off);
- addr = kallsyms_lookup_name(func_name);
+ addr = kallsyms_lookup_name(kfunc.name);
if (!addr) {
- verbose(env, "cannot find address for kernel function %s\n",
- func_name);
+ verbose(env, "cannot find address for kernel function %s\n", kfunc.name);
return -EINVAL;
}
@@ -3363,9 +3456,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
- err = btf_distill_func_proto(&env->log, desc_btf,
- func_proto, func_name,
- &func_model);
+ err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model);
if (err)
return err;
@@ -5427,6 +5518,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
+ if (env->bpf_capable && size == 4 && spill_size == 4 &&
+ get_reg_width(reg) <= 32)
+ /* Ensure stack slot has an ID to build a relation
+ * with the destination register on fill.
+ */
+ assign_scalar_id_before_mov(env, reg);
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
@@ -5472,6 +5569,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
} else if (dst_regno >= 0) {
/* restore register state from stack */
+ if (env->bpf_capable)
+ /* Ensure stack slot has an ID to build a relation
+ * with the destination register on fill.
+ */
+ assign_scalar_id_before_mov(env, reg);
copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
@@ -5654,8 +5756,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
int off, int size, enum bpf_access_type type)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_map *map = regs[regno].map_ptr;
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_map *map = reg->map_ptr;
u32 cap = bpf_map_flags_to_cap(map);
if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
@@ -6168,8 +6270,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err;
/* We may have added a variable offset to the packet pointer; but any
@@ -6256,8 +6357,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
u32 regno, int off, int size,
enum bpf_access_type t)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_insn_access_aux info = {};
bool valid;
@@ -7453,8 +7553,7 @@ static int check_stack_access_within_bounds(
int regno, int off, int access_size,
enum bpf_access_type type)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = regs + regno;
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
s64 min_off, max_off;
int err;
@@ -8408,7 +8507,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{
bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
bool is_irq = flags & PROCESS_LOCK_IRQ;
@@ -8522,9 +8621,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
/* Check if @regno is a pointer to a specific field in a map value */
static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
- enum btf_field_type field_type)
+ enum btf_field_type field_type,
+ struct bpf_map_desc *map_desc)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
@@ -8565,78 +8665,41 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
val + reg->off, struct_name, field_off);
return -EINVAL;
}
+ if (map_desc->ptr) {
+ verifier_bug(env, "Two map pointers in a %s helper", struct_name);
+ return -EFAULT;
+ }
+ map_desc->uid = reg->map_uid;
+ map_desc->ptr = map;
return 0;
}
static int process_timer_func(struct bpf_verifier_env *env, int regno,
- struct bpf_call_arg_meta *meta)
+ struct bpf_map_desc *map)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_TIMER);
- if (err)
- return err;
-
- if (meta->map_ptr) {
- verifier_bug(env, "Two map pointers in a timer helper");
- return -EFAULT;
- }
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
return -EOPNOTSUPP;
}
- meta->map_uid = reg->map_uid;
- meta->map_ptr = map;
- return 0;
+ return check_map_field_pointer(env, regno, BPF_TIMER, map);
}
-static int process_wq_func(struct bpf_verifier_env *env, int regno,
- struct bpf_kfunc_call_arg_meta *meta)
+static int process_timer_helper(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
- if (err)
- return err;
-
- if (meta->map.ptr) {
- verifier_bug(env, "Two map pointers in a bpf_wq helper");
- return -EFAULT;
- }
-
- meta->map.uid = reg->map_uid;
- meta->map.ptr = map;
- return 0;
+ return process_timer_func(env, regno, &meta->map);
}
-static int process_task_work_func(struct bpf_verifier_env *env, int regno,
- struct bpf_kfunc_call_arg_meta *meta)
+static int process_timer_kfunc(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
- if (err)
- return err;
-
- if (meta->map.ptr) {
- verifier_bug(env, "Two map pointers in a bpf_task_work helper");
- return -EFAULT;
- }
- meta->map.uid = reg->map_uid;
- meta->map.ptr = map;
- return 0;
+ return process_timer_func(env, regno, &meta->map);
}
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct btf_field *kptr_field;
struct bpf_map *map_ptr;
struct btf_record *rec;
@@ -8652,7 +8715,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
rec = map_ptr->record;
- meta->map_ptr = map_ptr;
+ meta->map.ptr = map_ptr;
}
if (!tnum_is_const(reg->var_off)) {
@@ -8709,7 +8772,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err;
if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
@@ -8829,7 +8892,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
const struct btf_type *t;
int spi, err, i, nr_slots, btf_id;
@@ -8944,15 +9007,24 @@ static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
struct bpf_idmap *idmap);
+/*
+ * Check if scalar registers are exact for the purpose of not widening.
+ * More lenient than regs_exact().
+ */
+static bool scalars_exact_for_widen(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur)
+{
+ return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id));
+}
+
static void maybe_widen_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap)
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur)
{
if (rold->type != SCALAR_VALUE)
return;
if (rold->type != rcur->type)
return;
- if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur))
return;
__mark_reg_unknown(env, rcur);
}
@@ -8964,7 +9036,6 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
struct bpf_func_state *fold, *fcur;
int i, fr, num_slots;
- reset_idmap_scratch(env);
for (fr = old->curframe; fr >= 0; fr--) {
fold = old->frame[fr];
fcur = cur->frame[fr];
@@ -8972,8 +9043,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++)
maybe_widen_reg(env,
&fold->regs[i],
- &fcur->regs[i],
- &env->idmap_scratch);
+ &fcur->regs[i]);
num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
fcur->allocated_stack / BPF_REG_SIZE);
@@ -8984,8 +9054,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
maybe_widen_reg(env,
&fold->stack[i].spilled_ptr,
- &fcur->stack[i].spilled_ptr,
- &env->idmap_scratch);
+ &fcur->stack[i].spilled_ptr);
}
}
return 0;
@@ -9159,13 +9228,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_arg_type *arg_type)
{
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* kernel subsystem misconfigured verifier */
verifier_bug(env, "invalid map_ptr to access map->type");
return -EFAULT;
}
- switch (meta->map_ptr->map_type) {
+ switch (meta->map.ptr->map_type) {
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -9301,7 +9370,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
const u32 *arg_btf_id,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
int i, j;
@@ -9719,7 +9788,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
@@ -9819,7 +9888,7 @@ skip_type_check:
switch (base_type(arg_type)) {
case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
- if (meta->map_ptr) {
+ if (meta->map.ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
@@ -9832,23 +9901,23 @@ skip_type_check:
*
* Comparing map_ptr is enough to distinguish normal and outer maps.
*/
- if (meta->map_ptr != reg->map_ptr ||
- meta->map_uid != reg->map_uid) {
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
verbose(env,
"timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
- meta->map_uid, reg->map_uid);
+ meta->map.uid, reg->map_uid);
return -EINVAL;
}
}
- meta->map_ptr = reg->map_ptr;
- meta->map_uid = reg->map_uid;
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
break;
case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
*/
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* in function declaration map_ptr must come before
* map_key, so that it's verified and known before
* we have to check map_key here. Otherwise it means
@@ -9857,11 +9926,11 @@ skip_type_check:
verifier_bug(env, "invalid map_ptr to access map->key");
return -EFAULT;
}
- key_size = meta->map_ptr->key_size;
+ key_size = meta->map.ptr->key_size;
err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
if (err)
return err;
- if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ if (can_elide_value_nullness(meta->map.ptr->map_type)) {
err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
if (err < 0) {
meta->const_map_key = -1;
@@ -9879,13 +9948,13 @@ skip_type_check:
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* kernel subsystem misconfigured verifier */
verifier_bug(env, "invalid map_ptr to access map->value");
return -EFAULT;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ err = check_helper_mem_access(env, regno, meta->map.ptr->value_size,
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
false, meta);
break;
@@ -9916,7 +9985,7 @@ skip_type_check:
}
break;
case ARG_PTR_TO_TIMER:
- err = process_timer_func(env, regno, meta);
+ err = process_timer_helper(env, regno, meta);
if (err)
return err;
break;
@@ -10354,10 +10423,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
+static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ enum bpf_arg_type arg_type = fn->arg_type[i];
+
+ if (base_type(arg_type) != ARG_PTR_TO_MEM)
+ continue;
+ if (!(arg_type & (MEM_WRITE | MEM_RDONLY)))
+ return false;
+ }
+
+ return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
+ check_mem_arg_rw_flag_ok(fn) &&
check_btf_id_ok(fn) ? 0 : -EINVAL;
}
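
To illustrate the new requirement (hypothetical helper proto, names invented; only the flag rule comes from check_mem_arg_rw_flag_ok() above): every ARG_PTR_TO_MEM argument must now be tagged with either MEM_RDONLY or MEM_WRITE, otherwise check_func_proto() returns -EINVAL.

/* Hypothetical helper proto, for illustration only. */
static const struct bpf_func_proto bpf_example_fill_buf_proto = {
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_WRITE,	/* buffer the helper writes */
	.arg2_type	= ARG_CONST_SIZE,		/* length paired with arg1 */
};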
@@ -11206,7 +11292,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
- struct bpf_map *map = meta->map_ptr;
+ struct bpf_map *map = meta->map.ptr;
if (func_id != BPF_FUNC_tail_call &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -11239,11 +11325,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
}
if (!aux->map_ptr_state.map_ptr)
- bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1, false);
- else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
- bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1, true);
+ bpf_map_ptr_store(aux, meta->map.ptr,
+ !meta->map.ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map.ptr)
+ bpf_map_ptr_store(aux, meta->map.ptr,
+ !meta->map.ptr->bypass_spec_v1, true);
return 0;
}
@@ -11252,8 +11338,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
- struct bpf_reg_state *regs = cur_regs(env), *reg;
- struct bpf_map *map = meta->map_ptr;
+ struct bpf_reg_state *reg;
+ struct bpf_map *map = meta->map.ptr;
u64 val, max;
int err;
@@ -11264,7 +11350,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- reg = &regs[BPF_REG_3];
+ reg = reg_state(env, BPF_REG_3);
val = reg->var_off.value;
max = map->max_entries;
@@ -11410,8 +11496,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_4);
bool reg_is_null = register_is_null(reg);
if (reg_is_null)
@@ -11471,6 +11556,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env)
{
return !env->cur_state->active_rcu_locks &&
!env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_locks &&
!env->cur_state->active_irq_id &&
in_sleepable(env);
}
@@ -11529,7 +11615,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id);
+ err = check_func_proto(fn);
if (err) {
verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
@@ -11809,22 +11895,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
- if (meta.map_ptr == NULL) {
+ if (meta.map.ptr == NULL) {
verifier_bug(env, "unexpected null map_ptr");
return -EFAULT;
}
if (func_id == BPF_FUNC_map_lookup_elem &&
- can_elide_value_nullness(meta.map_ptr->map_type) &&
+ can_elide_value_nullness(meta.map.ptr->map_type) &&
meta.const_map_key >= 0 &&
- meta.const_map_key < meta.map_ptr->max_entries)
+ meta.const_map_key < meta.map.ptr->max_entries)
ret_flag &= ~PTR_MAYBE_NULL;
- regs[BPF_REG_0].map_ptr = meta.map_ptr;
- regs[BPF_REG_0].map_uid = meta.map_uid;
+ regs[BPF_REG_0].map_ptr = meta.map.ptr;
+ regs[BPF_REG_0].map_uid = meta.map.uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_flag) &&
- btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
+ btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -11927,7 +12013,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) {
verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
func_id_name(func_id), func_id);
return -EFAULT;
@@ -11939,7 +12025,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id, meta.map_ptr)) {
+ } else if (is_acquire_function(func_id, meta.map.ptr)) {
int id = acquire_reference(env, insn_idx);
if (id < 0)
@@ -11954,7 +12040,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (err)
return err;
- err = check_map_func_compatibility(env, meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map.ptr, func_id);
if (err)
return err;
@@ -12045,11 +12131,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RELEASE;
}
-static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
-{
- return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
-}
-
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_SLEEPABLE;
@@ -12096,11 +12177,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
return btf_param_match_suffix(btf, arg, "__szk");
}
-static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
-{
- return btf_param_match_suffix(btf, arg, "__opt");
-}
-
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return btf_param_match_suffix(btf, arg, "__k");
@@ -12146,11 +12222,6 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__irq_flag");
}
-static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
-{
- return btf_param_match_suffix(btf, arg, "__prog");
-}
-
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -12179,6 +12250,8 @@ enum {
KF_ARG_WORKQUEUE_ID,
KF_ARG_RES_SPIN_LOCK_ID,
KF_ARG_TASK_WORK_ID,
+ KF_ARG_PROG_AUX_ID,
+ KF_ARG_TIMER_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -12190,6 +12263,8 @@ BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
BTF_ID(struct, bpf_res_spin_lock)
BTF_ID(struct, bpf_task_work)
+BTF_ID(struct, bpf_prog_aux)
+BTF_ID(struct, bpf_timer)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -12233,6 +12308,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
+static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID);
+}
+
static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
{
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
@@ -12270,6 +12350,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf
return true;
}
+static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID);
+}
+
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
const struct btf *btf,
@@ -12327,6 +12412,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_TIMER,
KF_ARG_PTR_TO_WORKQUEUE,
KF_ARG_PTR_TO_IRQ_FLAG,
KF_ARG_PTR_TO_RES_SPIN_LOCK,
@@ -12363,7 +12449,7 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
- KF_bpf_wq_set_callback_impl,
+ KF_bpf_wq_set_callback,
KF_bpf_preempt_disable,
KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
@@ -12383,8 +12469,14 @@ enum special_kfunc_type {
KF_bpf_dynptr_from_file,
KF_bpf_dynptr_file_discard,
KF___bpf_trap,
- KF_bpf_task_work_schedule_signal_impl,
- KF_bpf_task_work_schedule_resume_impl,
+ KF_bpf_task_work_schedule_signal,
+ KF_bpf_task_work_schedule_resume,
+ KF_bpf_arena_alloc_pages,
+ KF_bpf_arena_free_pages,
+ KF_bpf_arena_reserve_pages,
+ KF_bpf_session_is_return,
+ KF_bpf_stream_vprintk,
+ KF_bpf_stream_print_stack,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12424,7 +12516,7 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
-BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_wq_set_callback)
BTF_ID(func, bpf_preempt_disable)
BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
@@ -12457,13 +12549,19 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, bpf_dynptr_from_file)
BTF_ID(func, bpf_dynptr_file_discard)
BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal_impl)
-BTF_ID(func, bpf_task_work_schedule_resume_impl)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
+BTF_ID(func, bpf_arena_alloc_pages)
+BTF_ID(func, bpf_arena_free_pages)
+BTF_ID(func, bpf_arena_reserve_pages)
+BTF_ID(func, bpf_session_is_return)
+BTF_ID(func, bpf_stream_vprintk)
+BTF_ID(func, bpf_stream_print_stack)
static bool is_task_work_add_kfunc(u32 func_id)
{
- return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
- func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
@@ -12513,9 +12611,16 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = &regs[regno];
bool arg_mem_size = false;
- if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ meta->func_id == special_kfunc_list[KF_bpf_session_is_return] ||
+ meta->func_id == special_kfunc_list[KF_bpf_session_cookie])
return KF_ARG_PTR_TO_CTX;
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
/* In this function, we verify the kfunc's BTF as per the argument type,
* leaving the rest of the verification with respect to the register
* type to our caller. When a set of conditions hold in the BTF type of
@@ -12524,7 +12629,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) &&
+ !arg_mem_size)
return KF_ARG_PTR_TO_NULL;
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
@@ -12560,6 +12666,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
+ if (is_kfunc_arg_timer(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TIMER;
+
if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_TASK_WORK;
@@ -12581,11 +12690,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (argno + 1 < nargs &&
- (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
- is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
- arg_mem_size = true;
-
/* This is the catch all argument type of register types supported by
* check_helper_mem_access. However, we only allow when argument type is
* pointer to scalar, or struct composed (recursively) of scalars. When
@@ -12625,7 +12729,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
/* Enforce strict type matching for calls to kfuncs that are acquiring
* or releasing a reference, or are no-cast aliases. We do _not_
- * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * enforce strict matching for kfuncs by default,
* as we want to enable BPF programs to pass types that are bitwise
* equivalent without forcing them to explicitly cast with something
* like bpf_cast_to_kern_ctx().
@@ -12675,7 +12779,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int process_irq_flag(struct bpf_verifier_env *env, int regno,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err, kfunc_class = IRQ_NATIVE_KFUNC;
bool irq_save;
@@ -12893,10 +12997,24 @@ static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
}
+static bool is_bpf_arena_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] ||
+ btf_id == special_kfunc_list[KF_bpf_arena_free_pages] ||
+ btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages];
+}
+
+static bool is_bpf_stream_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] ||
+ btf_id == special_kfunc_list[KF_bpf_stream_print_stack];
+}
+
static bool kfunc_spin_allowed(u32 btf_id)
{
return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
- is_bpf_res_spin_lock_kfunc(btf_id);
+ is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) ||
+ is_bpf_stream_kfunc(btf_id);
}
static bool is_sync_callback_calling_kfunc(u32 btf_id)
@@ -12906,7 +13024,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ return is_bpf_wq_set_callback_kfunc(btf_id) ||
is_task_work_add_kfunc(btf_id);
}
@@ -12916,9 +13034,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
insn->imm == special_kfunc_list[KF_bpf_throw];
}
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback];
}
static bool is_callback_calling_kfunc(u32 btf_id)
@@ -13192,8 +13310,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_ignore(btf, &args[i]))
continue;
- if (is_kfunc_arg_prog(btf, &args[i])) {
- /* Used to reject repeated use of __prog. */
+ if (is_kfunc_arg_prog_aux(btf, &args[i])) {
+ /* Reject repeated use of the bpf_prog_aux argument. */
if (meta->arg_prog) {
verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
return -EFAULT;
@@ -13254,9 +13372,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
- (register_is_null(reg) || type_may_be_null(reg->type)) &&
- !is_kfunc_arg_nullable(meta->btf, &args[i])) {
+ if ((register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
}
@@ -13321,9 +13438,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
- if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
- break;
-
if (!is_trusted_reg(reg)) {
if (!is_kfunc_rcu(meta)) {
verbose(env, "R%d must be referenced or trusted\n", regno);
@@ -13348,6 +13462,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TIMER:
case KF_ARG_PTR_TO_TASK_WORK:
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
@@ -13575,7 +13690,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
@@ -13643,7 +13758,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a map value\n", i);
return -EINVAL;
}
- ret = process_wq_func(env, regno, meta);
+ ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_TIMER:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_timer_kfunc(env, regno, meta);
if (ret < 0)
return ret;
break;
@@ -13652,7 +13776,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a map value\n", i);
return -EINVAL;
}
- ret = process_task_work_func(env, regno, meta);
+ ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map);
if (ret < 0)
return ret;
break;
@@ -13699,44 +13823,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int fetch_kfunc_meta(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_kfunc_call_arg_meta *meta,
- const char **kfunc_name)
+static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_call_arg_meta *meta)
{
- const struct btf_type *func, *func_proto;
- u32 func_id, *kfunc_flags;
- const char *func_name;
- struct btf *desc_btf;
-
- if (kfunc_name)
- *kfunc_name = NULL;
+ struct bpf_kfunc_meta kfunc;
+ int err;
- if (!insn->imm)
- return -EINVAL;
+ err = fetch_kfunc_meta(env, func_id, offset, &kfunc);
+ if (err)
+ return err;
- desc_btf = find_kfunc_desc_btf(env, insn->off);
- if (IS_ERR(desc_btf))
- return PTR_ERR(desc_btf);
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = kfunc.btf;
+ meta->func_id = kfunc.id;
+ meta->func_proto = kfunc.proto;
+ meta->func_name = kfunc.name;
- func_id = insn->imm;
- func = btf_type_by_id(desc_btf, func_id);
- func_name = btf_name_by_offset(desc_btf, func->name_off);
- if (kfunc_name)
- *kfunc_name = func_name;
- func_proto = btf_type_by_id(desc_btf, func->type);
-
- kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
- if (!kfunc_flags) {
+ if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog))
return -EACCES;
- }
- memset(meta, 0, sizeof(*meta));
- meta->btf = desc_btf;
- meta->func_id = func_id;
- meta->kfunc_flags = *kfunc_flags;
- meta->func_proto = func_proto;
- meta->func_name = func_name;
+ meta->kfunc_flags = *kfunc.flags;
return 0;
}
@@ -13941,12 +14049,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!insn->imm)
return 0;
- err = fetch_kfunc_meta(env, insn, &meta, &func_name);
- if (err == -EACCES && func_name)
- verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ if (err == -EACCES && meta.func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", meta.func_name);
if (err)
return err;
desc_btf = meta.btf;
+ func_name = meta.func_name;
insn_aux = &env->insn_aux_data[insn_idx];
insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
@@ -14016,7 +14125,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.r0_rdonly = false;
}
- if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ if (is_bpf_wq_set_callback_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_timer_callback_state);
if (err) {
@@ -14154,8 +14263,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(env, regs, caller_saved[i]);
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ u32 regno = caller_saved[i];
+
+ mark_reg_not_init(env, regs, regno);
+ regs[regno].subreg_def = DEF_NOT_SUBREG;
+ }
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
@@ -14220,26 +14333,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (is_kfunc_rcu_protected(&meta))
regs[BPF_REG_0].type |= MEM_RCU;
} else {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
+ enum bpf_reg_type type = PTR_TO_BTF_ID;
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
- regs[BPF_REG_0].type |= PTR_UNTRUSTED;
- else if (is_kfunc_rcu_protected(&meta))
- regs[BPF_REG_0].type |= MEM_RCU;
-
- if (is_iter_next_kfunc(&meta)) {
- struct bpf_reg_state *cur_iter;
-
- cur_iter = get_iter_from_state(env->cur_state, &meta);
-
- if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
- regs[BPF_REG_0].type |= MEM_RCU;
- else
- regs[BPF_REG_0].type |= PTR_TRUSTED;
+ type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta) ||
+ (is_iter_next_kfunc(&meta) &&
+ (get_iter_from_state(env->cur_state, &meta)
+ ->type & MEM_RCU))) {
+ /*
+ * If the iterator's constructor (the _new
+ * function, e.g. bpf_iter_task_new) has been
+ * annotated with the BPF kfunc flag
+ * KF_RCU_PROTECTED and was called within an RCU
+ * read-side critical section, also propagate
+ * the MEM_RCU flag to the pointer returned from
+ * the iterator's next function (e.g.,
+ * bpf_iter_task_next).
+ */
+ type |= MEM_RCU;
+ } else {
+ /*
+ * Any PTR_TO_BTF_ID that is returned from a BPF
+ * kfunc should by default be treated as
+ * implicitly trusted.
+ */
+ type |= PTR_TRUSTED;
}
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = type;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
}
if (is_kfunc_ret_null(&meta)) {
@@ -14295,6 +14420,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie])
+ env->prog->call_session_cookie = true;
+
return 0;
}
@@ -15081,6 +15209,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
}
}
+static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+
+ *dst_umin = *dst_umin / src_val;
+ *dst_umax = *dst_umax / src_val;
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+
+ *dst_umin = div64_u64(*dst_umin, src_val);
+ *dst_umax = div64_u64(*dst_umax, src_val);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
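
A standalone userspace sanity check of the unsigned-division bounds above (illustration only, not kernel code): since x / d is monotonically non-decreasing in x for a fixed non-zero divisor, dividing both endpoints is sufficient.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* dst_reg range [7, 100], constant divisor 8 */
	uint64_t umin = 7, umax = 100, d = 8;
	uint64_t new_min = umin / d, new_max = umax / d;	/* [0, 12] */
	uint64_t x;

	/* every concrete value stays inside the tracked bounds */
	for (x = umin; x <= umax; x++)
		assert(x / d >= new_min && x / d <= new_max);
	return 0;
}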
+
+static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+ s32 res1, res2;
+
+ /* BPF div specification: S32_MIN / -1 = S32_MIN */
+ if (*dst_smin == S32_MIN && src_val == -1) {
+ /*
+ * If the dividend range contains more than just S32_MIN,
+ * we cannot precisely track the result, so it becomes unbounded.
+ * e.g., [S32_MIN, S32_MIN+10]/(-1),
+ * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)]
+ * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX]
+ * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN.
+ */
+ if (*dst_smax != S32_MIN) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
+ }
+ goto reset;
+ }
+
+ res1 = *dst_smin / src_val;
+ res2 = *dst_smax / src_val;
+ *dst_smin = min(res1, res2);
+ *dst_smax = max(res1, res2);
+
+reset:
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+ s64 res1, res2;
+
+ /* BPF div specification: S64_MIN / -1 = S64_MIN */
+ if (*dst_smin == S64_MIN && src_val == -1) {
+ /*
+ * If the dividend range contains more than just S64_MIN,
+ * we cannot precisely track the result, so it becomes unbounded.
+ * e.g., [S64_MIN, S64_MIN+10]/(-1),
+ * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)]
+ * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX]
+ * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN.
+ */
+ if (*dst_smax != S64_MIN) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
+ }
+ goto reset;
+ }
+
+ res1 = div64_s64(*dst_smin, src_val);
+ res2 = div64_s64(*dst_smax, src_val);
+ *dst_smin = min(res1, res2);
+ *dst_smax = max(res1, res2);
+
+reset:
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
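
/*
 * Illustrative userspace sketch (not part of the patch): BPF defines
 * S64_MIN / -1 as S64_MIN, unlike C where it is undefined behaviour.
 * When the dividend range contains S64_MIN plus other values, the results
 * are no longer a contiguous interval, which is why scalar_min_max_sdiv()
 * above gives up and marks the range unbounded in that case.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t bpf_sdiv64(int64_t a, int64_t b)
{
	if (a == INT64_MIN && b == -1)
		return INT64_MIN;	/* BPF-defined result */
	return a / b;
}

int main(void)
{
	/* dividend range [S64_MIN, S64_MIN + 2] divided by -1 */
	printf("%lld\n", (long long)bpf_sdiv64(INT64_MIN, -1));     /* S64_MIN */
	printf("%lld\n", (long long)bpf_sdiv64(INT64_MIN + 1, -1)); /* S64_MAX */
	printf("%lld\n", (long long)bpf_sdiv64(INT64_MIN + 2, -1)); /* S64_MAX - 1 */
	return 0;
}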
+
+static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+ u32 res_max = src_val - 1;
+
+ /*
+ * If dst_umax <= res_max, the result remains unchanged.
+ * e.g., [2, 5] % 10 = [2, 5].
+ */
+ if (*dst_umax <= res_max)
+ return;
+
+ *dst_umin = 0;
+ *dst_umax = min(*dst_umax, res_max);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_umod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+ u64 res_max = src_val - 1;
+
+ /*
+ * If dst_umax <= res_max, the result remains unchanged.
+ * e.g., [2, 5] % 10 = [2, 5].
+ */
+ if (*dst_umax <= res_max)
+ return;
+
+ *dst_umin = 0;
+ *dst_umax = min(*dst_umax, res_max);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
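
/*
 * Illustrative userspace sketch (not part of the patch): for a non-zero
 * constant divisor c, x % c always lands in [0, c - 1], and if the whole
 * dividend range already lies below c the operation is the identity.
 * Those are exactly the two cases handled by scalar_min_max_umod() above;
 * note that mapping the endpoints alone would be wrong (23 % 10 < 7 % 10).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t c = 10;

	/* [2, 5] % 10 stays [2, 5] */
	printf("%llu %llu\n", (unsigned long long)(2 % c),
	       (unsigned long long)(5 % c));
	/* [7, 23] % 10 can only be narrowed to [0, 9] */
	printf("%llu %llu\n", (unsigned long long)(7 % c),
	       (unsigned long long)(23 % c));
	return 0;
}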
+
+static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+
+ /*
+ * Safe absolute value calculation:
+ * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648.
+ * Here use unsigned integer to avoid overflow.
+ */
+ u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val;
+
+ /*
+ * Calculate the maximum possible absolute value of the result.
+	 * Even if src_abs is 2147483648 (the magnitude of S32_MIN),
+	 * subtracting 1 gives 2147483647 (S32_MAX), which fits in s32.
+ */
+ s32 res_max_abs = src_abs - 1;
+
+ /*
+ * If the dividend is already within the result range,
+ * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+ */
+ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ return;
+
+ /* General case: result has the same sign as the dividend. */
+ if (*dst_smin >= 0) {
+ *dst_smin = 0;
+ *dst_smax = min(*dst_smax, res_max_abs);
+ } else if (*dst_smax <= 0) {
+ *dst_smax = 0;
+ *dst_smin = max(*dst_smin, -res_max_abs);
+ } else {
+ *dst_smin = -res_max_abs;
+ *dst_smax = res_max_abs;
+ }
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+
+ /*
+ * Safe absolute value calculation:
+ * If src_val == S64_MIN (-2^63), src_abs becomes 2^63.
+ * Here use unsigned integer to avoid overflow.
+ */
+ u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val;
+
+ /*
+ * Calculate the maximum possible absolute value of the result.
+	 * Even if src_abs is 2^63 (the magnitude of S64_MIN), subtracting 1
+	 * gives 2^63 - 1 (S64_MAX), which fits in s64.
+ */
+ s64 res_max_abs = src_abs - 1;
+
+ /*
+ * If the dividend is already within the result range,
+ * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+ */
+ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ return;
+
+ /* General case: result has the same sign as the dividend. */
+ if (*dst_smin >= 0) {
+ *dst_smin = 0;
+ *dst_smax = min(*dst_smax, res_max_abs);
+ } else if (*dst_smax <= 0) {
+ *dst_smax = 0;
+ *dst_smin = max(*dst_smin, -res_max_abs);
+ } else {
+ *dst_smin = -res_max_abs;
+ *dst_smax = res_max_abs;
+ }
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
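
/*
 * Illustrative userspace sketch (not part of the patch): with truncating
 * division (C and BPF semantics) the remainder takes the sign of the
 * dividend, so a strictly non-negative dividend yields [0, |c|-1], a
 * strictly non-positive one yields [-(|c|-1), 0], and a mixed-sign range
 * yields [-(|c|-1), |c|-1], matching scalar_min_max_smod() above.
 */
#include <stdio.h>

int main(void)
{
	printf("%d %d\n",  7 % 5,  9 % 5);	/*  2  4 -> non-negative dividend */
	printf("%d %d\n", -7 % 5, -9 % 5);	/* -2 -4 -> non-positive dividend */
	printf("%d\n",    -7 % -5);		/* -2, sign follows the dividend */
	return 0;
}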
+
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
@@ -15305,21 +15679,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
/* Special case <<32 because it is a common compiler pattern to sign
- * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
- * positive we know this shift will also be positive so we can track
- * bounds correctly. Otherwise we lose all sign bit information except
- * what we can pick up from var_off. Perhaps we can generalize this
- * later to shifts of any length.
+ * extend subreg by doing <<32 s>>32. smin/smax assignments are correct
+ * because s32 bounds don't flip sign when shifting to the left by
+ * 32bits.
*/
- if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
+ if (umin_val == 32 && umax_val == 32) {
dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
- else
- dst_reg->smax_value = S64_MAX;
-
- if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
- else
+ } else {
+ dst_reg->smax_value = S64_MAX;
dst_reg->smin_value = S64_MIN;
+ }
/* If we might shift our top bit out, then we know nothing */
if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
@@ -15462,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn)
+{
+ /*
+ * Byte swap operation - update var_off using tnum_bswap.
+ * Three cases:
+ * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE)
+ * unconditional swap
+ * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE)
+ * swap on big-endian, truncation or no-op on little-endian
+ * 3. to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE)
+ * swap on little-endian, truncation or no-op on big-endian
+ */
+
+ bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool to_le = BPF_SRC(insn->code) == BPF_TO_LE;
+	bool is_big_endian = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
+	/* Swap for alu64 bswap, or when the target byte order differs from the host byte order */
+ bool need_bswap = alu64 || (to_le == is_big_endian);
+
+ if (need_bswap) {
+ if (insn->imm == 16)
+ dst_reg->var_off = tnum_bswap16(dst_reg->var_off);
+ else if (insn->imm == 32)
+ dst_reg->var_off = tnum_bswap32(dst_reg->var_off);
+ else if (insn->imm == 64)
+ dst_reg->var_off = tnum_bswap64(dst_reg->var_off);
+ /*
+ * Byteswap scrambles the range, so we must reset bounds.
+ * Bounds will be re-derived from the new tnum later.
+ */
+ __mark_reg_unbounded(dst_reg);
+ }
+ /* For bswap16/32, truncate dst register to match the swapped size */
+ if (insn->imm == 16 || insn->imm == 32)
+ coerce_reg_to_size(dst_reg, insn->imm / 8);
+}
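
/*
 * Illustrative userspace sketch (not part of the patch): the need_bswap
 * decision above reduces to "ALU64 bswap always swaps, to_le swaps on a
 * big-endian host, to_be swaps on a little-endian host". The table below
 * prints that decision for both host endiannesses.
 */
#include <stdbool.h>
#include <stdio.h>

static bool need_bswap(bool alu64, bool to_le, bool host_is_be)
{
	return alu64 || (to_le == host_is_be);
}

int main(void)
{
	const char *host[] = { "little-endian", "big-endian" };

	for (int be = 0; be <= 1; be++) {
		printf("%s host: bswap=%d to_le=%d to_be=%d\n", host[be],
		       need_bswap(true, true, be),    /* bswap16/32/64 */
		       need_bswap(false, true, be),   /* le16/32/64 */
		       need_bswap(false, false, be)); /* be16/32/64 */
	}
	return 0;
}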
+
static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
const struct bpf_reg_state *src_reg)
{
@@ -15488,8 +15900,17 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
case BPF_XOR:
case BPF_OR:
case BPF_MUL:
+ case BPF_END:
return true;
+ /*
+	 * The range of division and modulo operations is only safe to
+	 * compute when the divisor is a constant.
+ */
+ case BPF_DIV:
+ case BPF_MOD:
+ return src_is_const;
+
/* Shift operators range is only computable if shift dimension operand
* is a constant. Shifts greater than 31 or 63 are undefined. This
* includes shifts by a negative number.
@@ -15503,6 +15924,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
}
}
+static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg)
+{
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+ bool alu32;
+
+ if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0)
+ alu32 = false;
+ else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0)
+ alu32 = true;
+ else
+ return 0;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+
+ regs = branch->frame[branch->curframe]->regs;
+ if (alu32) {
+ __mark_reg32_known(&regs[insn->dst_reg], 0);
+ __mark_reg32_known(dst_reg, -1ull);
+ } else {
+ __mark_reg_known(&regs[insn->dst_reg], 0);
+ __mark_reg_known(dst_reg, -1ull);
+ }
+ return 0;
+}
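
/*
 * Illustrative userspace sketch (not part of the patch): when a register
 * is known to be either 0 or -1 (all ones), AND/OR with a constant has
 * only two possible outcomes, so it is more precise to fork verification
 * into a "0" state and a "-1" state, as maybe_fork_scalars() is used for
 * in the BPF_AND/BPF_OR cases below when the source is a constant.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t mask = 0x0ff0;

	for (int64_t x = -1; x <= 0; x++) {
		/* x is either all ones or all zeroes */
		printf("x=%2lld: x & C = %#llx, x | C = %#llx\n",
		       (long long)x,
		       (unsigned long long)(x & mask),
		       (unsigned long long)(x | mask));
	}
	return 0;
}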
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -15513,6 +15963,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state src_reg)
{
u8 opcode = BPF_OP(insn->code);
+ s16 off = insn->off;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
@@ -15564,12 +16015,54 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar32_min_max_mul(dst_reg, &src_reg);
scalar_min_max_mul(dst_reg, &src_reg);
break;
+ case BPF_DIV:
+ /* BPF div specification: x / 0 = 0 */
+ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) {
+ ___mark_reg_known(dst_reg, 0);
+ break;
+ }
+		if (alu32) {
+			if (off == 1)
+				scalar32_min_max_sdiv(dst_reg, &src_reg);
+			else
+				scalar32_min_max_udiv(dst_reg, &src_reg);
+		} else {
+			if (off == 1)
+				scalar_min_max_sdiv(dst_reg, &src_reg);
+			else
+				scalar_min_max_udiv(dst_reg, &src_reg);
+		}
+ break;
+ case BPF_MOD:
+ /* BPF mod specification: x % 0 = x */
+ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0))
+ break;
+		if (alu32) {
+			if (off == 1)
+				scalar32_min_max_smod(dst_reg, &src_reg);
+			else
+				scalar32_min_max_umod(dst_reg, &src_reg);
+		} else {
+			if (off == 1)
+				scalar_min_max_smod(dst_reg, &src_reg);
+			else
+				scalar_min_max_umod(dst_reg, &src_reg);
+		}
+ break;
case BPF_AND:
+ if (tnum_is_const(src_reg.var_off)) {
+ ret = maybe_fork_scalars(env, insn, dst_reg);
+ if (ret)
+ return ret;
+ }
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg);
scalar_min_max_and(dst_reg, &src_reg);
break;
case BPF_OR:
+ if (tnum_is_const(src_reg.var_off)) {
+ ret = maybe_fork_scalars(env, insn, dst_reg);
+ if (ret)
+ return ret;
+ }
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
@@ -15597,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
+ case BPF_END:
+ scalar_byte_swap(dst_reg, insn);
+ break;
default:
break;
}
- /* ALU32 ops are zero extended into 64bit register */
- if (alu32)
+ /*
+ * ALU32 ops are zero extended into 64bit register.
+ *
+ * BPF_END is already handled inside the helper (truncation),
+ * so skip zext here to avoid unexpected zero extension.
+ * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40
+ * This is a 64bit byte swap operation with alu32==true,
+ * but we should not zero extend the result.
+ */
+ if (alu32 && opcode != BPF_END)
zext_32_to_64(dst_reg);
reg_bounds_sync(dst_reg);
return 0;
@@ -15705,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "verifier internal error: no src_reg\n");
return -EFAULT;
}
+ /*
+ * For alu32 linked register tracking, we need to check dst_reg's
+ * umax_value before the ALU operation. After adjust_scalar_min_max_vals(),
+ * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX.
+ */
+ u64 dst_umax = dst_reg->umax_value;
+
err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
if (err)
return err;
@@ -15714,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* r1 += 0x1
* if r2 < 1000 goto ...
* use r1 in memory access
- * So for 64-bit alu remember constant delta between r2 and r1 and
- * update r1 after 'if' condition.
+ * So remember constant delta between r2 and r1 and update r1 after
+ * 'if' condition.
*/
if (env->bpf_capable &&
- BPF_OP(insn->code) == BPF_ADD && !alu32 &&
- dst_reg->id && is_reg_const(src_reg, false)) {
- u64 val = reg_const_value(src_reg, false);
+ (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) &&
+ dst_reg->id && is_reg_const(src_reg, alu32)) {
+ u64 val = reg_const_value(src_reg, alu32);
+ s32 off;
+
+ if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX))
+ goto clear_id;
+
+ if (alu32 && (dst_umax > U32_MAX))
+ goto clear_id;
- if ((dst_reg->id & BPF_ADD_CONST) ||
- /* prevent overflow in sync_linked_regs() later */
- val > (u32)S32_MAX) {
+ off = (s32)val;
+
+ if (BPF_OP(insn->code) == BPF_SUB) {
+ /* Negating S32_MIN would overflow */
+ if (off == S32_MIN)
+ goto clear_id;
+ off = -off;
+ }
+
+ if (dst_reg->id & BPF_ADD_CONST) {
/*
* If the register already went through rX += val
* we cannot accumulate another val into rx->off.
*/
+clear_id:
dst_reg->off = 0;
dst_reg->id = 0;
} else {
- dst_reg->id |= BPF_ADD_CONST;
- dst_reg->off = val;
+ if (alu32)
+ dst_reg->id |= BPF_ADD_CONST32;
+ else
+ dst_reg->id |= BPF_ADD_CONST64;
+ dst_reg->off = off;
}
} else {
/*
@@ -15782,7 +16311,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- if (opcode == BPF_NEG &&
+ if ((opcode == BPF_NEG || opcode == BPF_END) &&
regs[insn->dst_reg].type == SCALAR_VALUE) {
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
err = err ?: adjust_scalar_min_max_vals(env, insn,
@@ -16802,8 +17331,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
/* For all R in linked_regs, copy known_reg range into R
* if R->id == known_reg->id.
*/
-static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
- struct linked_regs *linked_regs)
+static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate,
+ struct bpf_reg_state *known_reg, struct linked_regs *linked_regs)
{
struct bpf_reg_state fake_reg;
struct bpf_reg_state *reg;
@@ -16827,23 +17356,32 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s
} else {
s32 saved_subreg_def = reg->subreg_def;
s32 saved_off = reg->off;
+ u32 saved_id = reg->id;
fake_reg.type = SCALAR_VALUE;
- __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+ __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off);
/* reg = known_reg; reg += delta */
copy_register_state(reg, known_reg);
/*
- * Must preserve off, id and add_const flag,
+			 * Must preserve off, id and subreg_def,
* otherwise another sync_linked_regs() will be incorrect.
*/
reg->off = saved_off;
+ reg->id = saved_id;
reg->subreg_def = saved_subreg_def;
scalar32_min_max_add(reg, &fake_reg);
scalar_min_max_add(reg, &fake_reg);
reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ if (known_reg->id & BPF_ADD_CONST32)
+ zext_32_to_64(reg);
+ reg_bounds_sync(reg);
}
+ if (e->is_reg)
+ mark_reg_scratched(env, e->regno);
+ else
+ mark_stack_slot_scratched(env, e->spi);
}
}
@@ -17030,13 +17568,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- sync_linked_regs(this_branch, src_reg, &linked_regs);
- sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
+ sync_linked_regs(env, this_branch, src_reg, &linked_regs);
+ sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg],
+ &linked_regs);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- sync_linked_regs(this_branch, dst_reg, &linked_regs);
- sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
+ sync_linked_regs(env, this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg],
+ &linked_regs);
}
/* if one pointer register is compared to another pointer
@@ -17411,6 +17951,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
@@ -17693,6 +18234,10 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
switch (imm) {
#ifdef CONFIG_X86_64
case BPF_FUNC_get_smp_processor_id:
+#ifdef CONFIG_SMP
+ case BPF_FUNC_get_current_task_btf:
+ case BPF_FUNC_get_current_task:
+#endif
return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
#endif
default:
@@ -17737,7 +18282,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_pseudo_kfunc_call(call)) {
int err;
- err = fetch_kfunc_meta(env, call, &meta, NULL);
+ err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
if (err < 0)
/* error would be reported later */
return false;
@@ -18245,7 +18790,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
- ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
if (ret == 0 && is_iter_next_kfunc(&meta)) {
mark_prune_point(env, t);
/* Checking and saving state checkpoints at iter_next() call
@@ -18948,30 +19493,49 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
if (old_id == 0) /* cur_id == 0 as well */
return true;
- for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!map[i].old) {
- /* Reached an empty slot; haven't seen this id before */
- map[i].old = old_id;
- map[i].cur = cur_id;
- return true;
- }
+ for (i = 0; i < idmap->cnt; i++) {
if (map[i].old == old_id)
return map[i].cur == cur_id;
if (map[i].cur == cur_id)
return false;
}
+
+ /* Reached the end of known mappings; haven't seen this id before */
+ if (idmap->cnt < BPF_ID_MAP_SIZE) {
+ map[idmap->cnt].old = old_id;
+ map[idmap->cnt].cur = cur_id;
+ idmap->cnt++;
+ return true;
+ }
+
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
-/* Similar to check_ids(), but allocate a unique temporary ID
- * for 'old_id' or 'cur_id' of zero.
- * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+/*
+ * Compare scalar register IDs for state equivalence.
+ *
+ * When old_id == 0, the old register is independent - not linked to any
+ * other register. Any linking in the current state only adds constraints,
+ * making it more restrictive. Since the old state didn't rely on any ID
+ * relationships for this register, it's always safe to accept cur regardless
+ * of its ID. Hence, return true immediately.
+ *
+ * When old_id != 0 but cur_id == 0, we need to ensure that different
+ * independent registers in cur don't incorrectly satisfy the ID matching
+ * requirements of linked registers in old.
+ *
+ * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
+ * and r7.id=0 (both independent), without temp IDs both would map old_id=X
+ * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
+ * X->temp2, but X is already mapped to temp1, so the check fails correctly.
*/
static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
- old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ if (!old_id)
+ return true;
+
cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
return check_ids(old_id, cur_id, idmap);
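
/*
 * Illustrative userspace sketch (not part of the patch): a miniature
 * check_ids() over a tiny id map, showing why a temporary id for
 * cur_id == 0 makes two old-linked registers (same old id X) fail to
 * match two independent current registers, as described in the comment
 * above check_scalar_ids().
 */
#include <stdbool.h>
#include <stdio.h>

struct pair { unsigned old, cur; };
static struct pair map[16];
static unsigned cnt, tmp_id_gen = 100;

static bool check_ids(unsigned old_id, unsigned cur_id)
{
	for (unsigned i = 0; i < cnt; i++) {
		if (map[i].old == old_id)
			return map[i].cur == cur_id;
		if (map[i].cur == cur_id)
			return false;
	}
	map[cnt].old = old_id;
	map[cnt].cur = cur_id;
	cnt++;
	return true;
}

int main(void)
{
	unsigned X = 7;

	/* old r6.id = X, cur r6.id = 0 -> temp id */
	printf("r6: %d\n", check_ids(X, ++tmp_id_gen));	/* 1: X maps to 101 */
	/* old r7.id = X, cur r7.id = 0 -> a different temp id */
	printf("r7: %d\n", check_ids(X, ++tmp_id_gen));	/* 0: X already maps to 101 */
	return 0;
}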
@@ -19045,6 +19609,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* doesn't meant that the states are DONE. The verifier has to compare
* the callsites
*/
+
+/* Find id in idset and increment its count, or add new entry */
+static void idset_cnt_inc(struct bpf_idset *idset, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < idset->num_ids; i++) {
+ if (idset->entries[i].id == id) {
+ idset->entries[i].cnt++;
+ return;
+ }
+ }
+ /* New id */
+ if (idset->num_ids < BPF_ID_MAP_SIZE) {
+ idset->entries[idset->num_ids].id = id;
+ idset->entries[idset->num_ids].cnt = 1;
+ idset->num_ids++;
+ }
+}
+
+/* Find id in idset and return its count, or 0 if not found */
+static u32 idset_cnt_get(struct bpf_idset *idset, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < idset->num_ids; i++) {
+ if (idset->entries[i].id == id)
+ return idset->entries[i].cnt;
+ }
+ return 0;
+}
+
+/*
+ * Clear singular scalar ids in a state.
+ * A register with a non-zero id is called singular if no other register shares
+ * the same base id. Such registers can be treated as independent (id=0).
+ */
+static void clear_singular_ids(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_idset *idset = &env->idset_scratch;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+
+ idset->num_ids = 0;
+
+ bpf_for_each_reg_in_vstate(st, func, reg, ({
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ if (!reg->id)
+ continue;
+ idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST);
+ }));
+
+ bpf_for_each_reg_in_vstate(st, func, reg, ({
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ if (!reg->id)
+ continue;
+ if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) {
+ reg->id = 0;
+ reg->off = 0;
+ }
+ }));
+}
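
/*
 * Illustrative userspace sketch (not part of the patch): clearing
 * "singular" ids amounts to counting how many registers share each base
 * id and zeroing the id of any register whose id is unique, since an id
 * with a single user carries no linking information.
 */
#include <stdio.h>

int main(void)
{
	/* ids of six scalar registers; 0 means "no id" */
	unsigned id[6] = { 3, 3, 5, 0, 9, 9 };
	unsigned cnt[16] = { 0 };

	for (int i = 0; i < 6; i++)
		if (id[i])
			cnt[id[i]]++;
	for (int i = 0; i < 6; i++)
		if (id[i] && cnt[id[i]] == 1)
			id[i] = 0;	/* id 5 is singular -> cleared */

	for (int i = 0; i < 6; i++)
		printf("r%d.id = %u\n", i, id[i]);
	return 0;
}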
+
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
@@ -19091,11 +19721,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (rold->type == NOT_INIT) {
- if (exact == NOT_EXACT || rcur->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
- }
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
/* Enforce that register types have to match exactly, including their
* modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
@@ -19132,11 +19760,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
if (!rold->precise && exact == NOT_EXACT)
return true;
- if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
- return false;
- if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
- return false;
- /* Why check_ids() for scalar registers?
+ /*
+ * Linked register tracking uses rold->id to detect relationships.
+ * When rold->id == 0, the register is independent and any linking
+ * in rcur only adds constraints. When rold->id != 0, we must verify
+ * id mapping and (for BPF_ADD_CONST) offset consistency.
+ *
+ * +------------------+-----------+------------------+---------------+
+ * | | rold->id | rold + ADD_CONST | rold->id == 0 |
+ * |------------------+-----------+------------------+---------------|
+ * | rcur->id | range,ids | false | range |
+ * | rcur + ADD_CONST | false | range,ids,off | range |
+ * | rcur->id == 0 | range,ids | false | range |
+ * +------------------+-----------+------------------+---------------+
+ *
+ * Why check_ids() for scalar registers?
*
* Consider the following BPF code:
* 1: r6 = ... unbound scalar, ID=a ...
@@ -19160,9 +19798,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* ---
* Also verify that new value satisfies old value range knowledge.
*/
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off) &&
- check_scalar_ids(rold->id, rcur->id, idmap);
+
+ /* ADD_CONST mismatch: different linking semantics */
+ if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST))
+ return false;
+
+ if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST))
+ return false;
+
+ /* Both have offset linkage: offsets must match */
+ if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off)
+ return false;
+
+ if (!check_scalar_ids(rold->id, rcur->id, idmap))
+ return false;
+
+ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
@@ -19264,7 +19915,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (exact != NOT_EXACT &&
+ if (exact == EXACT &&
(i >= cur->allocated_stack ||
old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
@@ -19470,8 +20121,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ struct bpf_idmap *idmap = &env->idmap_scratch;
+
+ idmap->tmp_id_gen = env->id_gen;
+ idmap->cnt = 0;
}
static bool states_equal(struct bpf_verifier_env *env,
@@ -19835,8 +20488,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
}
if (bpf_calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
goto hit;
+ }
goto skip_inf_loop_check;
}
/* attempt to detect infinite loop to avoid unnecessary doomed work */
@@ -20041,6 +20696,8 @@ miss:
if (env->bpf_capable)
mark_all_scalars_imprecise(env, cur);
+ clear_singular_ids(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -20611,17 +21268,19 @@ static int do_check(struct bpf_verifier_env *env)
* may skip a nospec patched-in after the jump. This can
* currently never happen because nospec_result is only
* used for the write-ops
- * `*(size*)(dst_reg+off)=src_reg|imm32` which must
- * never skip the following insn. Still, add a warning
- * to document this in case nospec_result is used
- * elsewhere in the future.
+ * `*(size*)(dst_reg+off)=src_reg|imm32` and helper
+ * calls. These must never skip the following insn
+ * (i.e., bpf_insn_successors()'s opcode_info.can_jump
+ * is false). Still, add a warning to document this in
+ * case nospec_result is used elsewhere in the future.
*
* All non-branch instructions have a single
* fall-through edge. For these, nospec_result should
* already work.
*/
- if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP ||
- BPF_CLASS(insn->code) == BPF_JMP32, env,
+ if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP ||
+ BPF_CLASS(insn->code) == BPF_JMP32) &&
+ BPF_OP(insn->code) != BPF_CALL, env,
"speculation barrier after jump instruction may not have the desired effect"))
return -EFAULT;
process_bpf_exit:
@@ -20660,12 +21319,7 @@ static int find_btf_percpu_datasec(struct btf *btf)
* types to look at only module's own BTF types.
*/
n = btf_nr_types(btf);
- if (btf_is_module(btf))
- i = btf_nr_types(btf_vmlinux);
- else
- i = 1;
-
- for(; i < n; i++) {
+ for (i = btf_named_start_id(btf, true); i < n; i++) {
t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
continue;
@@ -20890,20 +21544,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (btf_record_has_field(map->record, BPF_TIMER)) {
- if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_timer yet\n");
- return -EINVAL;
- }
- }
-
- if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
- if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_wq yet\n");
- return -EINVAL;
- }
- }
-
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -20935,6 +21575,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
case BPF_MAP_TYPE_INSN_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
break;
default:
verbose(env,
@@ -21141,11 +21782,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
} else {
u32 off = insn[1].imm;
- if (off >= BPF_MAX_VAR_OFF) {
- verbose(env, "direct value offset of %u is not allowed\n", off);
- return -EINVAL;
- }
-
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
return -EINVAL;
@@ -22446,6 +23082,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc
} else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
if (!env->insn_aux_data[insn_idx].non_sleepable)
addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
}
desc->addr = addr;
return 0;
@@ -22498,8 +23140,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!bpf_jit_supports_far_kfunc_call())
insn->imm = BPF_CALL_IMM(desc->addr);
- if (insn->off)
- return 0;
+
if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
@@ -22565,6 +23206,36 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline the bpf_session_is_return() for fsession:
+ * bool bpf_session_is_return(void *ctx)
+ * {
+ * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline bpf_session_cookie() for fsession:
+ * __u64 *bpf_session_cookie(void *ctx)
+ * {
+ * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
+ * return &((u64 *)ctx)[-off];
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
+ insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
+ *cnt = 6;
}
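
/*
 * Illustrative userspace sketch (not part of the patch): the two inlined
 * kfuncs above read a control word stashed by the trampoline at ctx[-1].
 * The sketch mirrors that decoding in plain C. The concrete values of
 * BPF_TRAMP_IS_RETURN_SHIFT and BPF_TRAMP_COOKIE_INDEX_SHIFT below are
 * placeholders (assumptions); only the shape of the decoding is taken
 * from the emitted instruction sequences.
 */
#include <stdbool.h>
#include <stdint.h>

#define BPF_TRAMP_IS_RETURN_SHIFT	8	/* placeholder value */
#define BPF_TRAMP_COOKIE_INDEX_SHIFT	16	/* placeholder value */

static bool session_is_return(uint64_t *ctx)
{
	return (ctx[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
}

static uint64_t *session_cookie(uint64_t *ctx)
{
	uint64_t off = (ctx[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;

	return &ctx[-(int64_t)off];
}

int main(void)
{
	uint64_t frame[4] = { 0 };
	uint64_t *ctx = &frame[3];

	/* pretend the trampoline stored: is_return=1, cookie index=2 */
	ctx[-1] = (1ULL << BPF_TRAMP_IS_RETURN_SHIFT) |
		  (2ULL << BPF_TRAMP_COOKIE_INDEX_SHIFT);
	return session_is_return(ctx) && session_cookie(ctx) == &ctx[-2] ? 0 : 1;
}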
if (env->insn_aux_data[insn_idx].arg_prog) {
@@ -23278,21 +23949,48 @@ patch_map_ops_generic:
insn = new_prog->insnsi + i + delta;
goto next_insn;
}
+
+ /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
+ if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
#endif
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
- insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
- insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
- insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
- insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
- insn_buf[7] = BPF_JMP_A(1);
- insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
- cnt = 9;
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+ insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_JMP_A(1);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -23308,15 +24006,17 @@ patch_map_ops_generic:
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ret) {
if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_TRACE_FSESSION ||
eatype == BPF_MODIFY_RETURN) {
/* Load nr_args from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
- insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
- insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
- cnt = 6;
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 7;
} else {
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
cnt = 1;
@@ -23335,13 +24035,24 @@ patch_map_ops_generic:
/* Implement get_func_arg_cnt inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg_cnt) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+ delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
goto next_insn;
@@ -24252,7 +24963,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
prog_extension &&
(tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
- tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
/* Program extensions can extend all program types
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
@@ -24267,7 +24979,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* beyond reasonable stack size. Hence extending fentry
* is not allowed.
*/
- bpf_log(log, "Cannot extend fentry/fexit\n");
+ bpf_log(log, "Cannot extend fentry/fexit/fsession\n");
return -EINVAL;
}
} else {
@@ -24351,6 +25063,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION &&
+ !bpf_jit_supports_fsession()) {
+ bpf_log(log, "JIT does not support fsession\n");
+ return -EOPNOTSUPP;
+ }
if (!btf_type_is_func(t)) {
bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
@@ -24517,6 +25235,7 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
+ case BPF_TRACE_FSESSION:
return true;
default:
return false;
@@ -24598,9 +25317,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
tgt_info.tgt_name);
return -EINVAL;
} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION ||
prog->expected_attach_type == BPF_MODIFY_RETURN) &&
btf_id_set_contains(&noreturn_deny, btf_id)) {
- verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+ verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
tgt_info.tgt_name);
return -EINVAL;
}
@@ -24809,6 +25529,12 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env,
case BPF_JMP32:
switch (code) {
case BPF_JA:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_X)
+ use = dst;
+ else
+ use = 0;
+ break;
case BPF_JCOND:
def = 0;
use = 0;
@@ -25076,15 +25802,18 @@ dfs_continue:
}
/*
* Assign SCC number only if component has two or more elements,
- * or if component has a self reference.
+ * or if component has a self reference, or if instruction is a
+	 * or if component has a self reference, or if the instruction
+	 * calls a callback (an implicit loop).
- assign_scc = stack[stack_sz - 1] != w;
- for (j = 0; j < succ->cnt; ++j) {
+ assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
+ for (j = 0; j < succ->cnt; ++j) { /* self reference? */
if (succ->items[j] == w) {
assign_scc = true;
break;
}
}
+ if (bpf_calls_callback(env, w)) /* implicit loop? */
+ assign_scc = true;
/* Pop component elements from stack */
do {
t = stack[--stack_sz];
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..3bfe37693d68 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -52,7 +52,7 @@ struct cgroup_fs_context {
bool cpuset_clone_children;
bool none; /* User explicitly requested empty subsystem */
bool all_ss; /* Seen 'all' option */
- u16 subsys_mask; /* Selected subsystems */
+ u32 subsys_mask; /* Selected subsystems */
char *name; /* Hierarchy name */
char *release_agent; /* Path for release notifications */
};
@@ -146,7 +146,7 @@ struct cgroup_mgctx {
struct cgroup_taskset tset;
/* subsystems affected by migration */
- u16 ss_mask;
+ u32 ss_mask;
};
#define CGROUP_TASKSET_INIT(tset) \
@@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a9e029b570c8..724950c4b690 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -28,7 +28,7 @@
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
+static u32 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
@@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- u16 mask = U16_MAX;
- u16 enabled = 0;
+ u32 mask = U32_MAX;
+ u32 enabled = 0;
struct cgroup_subsys *ss;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
+ mask = ~((u32)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
@@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
int ret = 0;
- u16 added_mask, removed_mask;
+ u32 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str)
continue;
if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
+ cgroup_no_v1_mask = U32_MAX;
continue;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b04910..8af4351536cf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
-static u16 cgrp_dfl_inhibit_ss_mask;
+static u32 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static u16 cgrp_dfl_implicit_ss_mask;
+static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
-static u16 cgrp_dfl_threaded_ss_mask;
+static u32 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
@@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1;
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
-static u16 have_fork_callback __read_mostly;
-static u16 have_exit_callback __read_mostly;
-static u16 have_release_callback __read_mostly;
-static u16 have_canfork_callback __read_mostly;
+static u32 have_fork_callback __read_mostly;
+static u32 have_exit_callback __read_mostly;
+static u32 have_release_callback __read_mostly;
+static u32 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
@@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
}
/* subsystems visibly enabled on a cgroup */
-static u16 cgroup_control(struct cgroup *cgrp)
+static u32 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- u16 root_ss_mask = cgrp->root->subsys_mask;
+ u32 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
- u16 ss_mask = parent->subtree_control;
+ u32 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp)
}
/* subsystems enabled on a cgroup */
-static u16 cgroup_ss_mask(struct cgroup *cgrp)
+static u32 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
- u16 ss_mask = parent->subtree_ss_mask;
+ u32 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
+static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask)
{
- u16 cur_ss_mask = subtree_control;
+ u32 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- u16 new_ss_mask = cur_ss_mask;
+ u32 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
@@ -1848,12 +1848,12 @@ err:
return ret;
}
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, ret;
- u16 dfl_disable_ss_mask = 0;
+ u32 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task,
put_task_struct(task);
}
-static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
@@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
-static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable)
{
- u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+ u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
@@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- u16 enable = 0, disable = 0;
+ u32 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (child->flags & CSS_ONLINE) {
+ if (css_is_online(child)) {
ret = true;
break;
}
@@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css)
lockdep_assert_held(&cgroup_mutex);
- if (!(css->flags & CSS_ONLINE))
+ if (!css_is_online(css))
return;
if (ss->css_offline)
@@ -6347,7 +6347,7 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss;
int ssid;
- BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 01976c8e7d49..fd7d19842ded 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -9,6 +9,7 @@
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
+#include <linux/sched/isolation.h>
/* See "Frequency meter" comments, below. */
@@ -144,17 +145,12 @@ struct cpuset {
*/
nodemask_t old_mems_allowed;
- struct fmeter fmeter; /* memory_pressure filter */
-
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
- /* for custom sched domain */
- int relax_domain_level;
-
/* partition root state */
int partition_root_state;
@@ -179,10 +175,19 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+#ifdef CONFIG_CPUSETS_V1
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
+#endif
};
+extern struct cpuset top_cpuset;
+
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
@@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_cpuset_lock_held();
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -285,7 +314,6 @@ void cpuset_full_unlock(void);
*/
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
-void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
@@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2);
+void cpuset1_init(struct cpuset *cs);
+void cpuset1_online_css(struct cgroup_subsys_state *css);
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes);
+
#else
-static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
@@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
+static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1,
+ struct cpuset *cs2) { return false; }
+static inline void cpuset1_init(struct cpuset *cs) {}
+static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
+static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+					struct sched_domain_attr **attributes) { return 0; }
+
#endif /* CONFIG_CPUSETS_V1 */
#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 12e76774c75b..7a23b9e8778f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct {
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
-void fmeter_init(struct fmeter *fmp)
+static void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
@@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
if (par && !is_cpuset_subset(trial, par))
goto out;
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if (cpuset_is_populated(cur)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
return ret;
}
+/*
+ * cpuset1_cpus_excl_conflict() - Check if two cpusets have an exclusive CPU
+ *                                conflict under legacy (v1) semantics
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
+ */
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return cpumask_intersects(cs1->cpus_allowed,
+ cs2->cpus_allowed);
+
+ return false;
+}
+
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
@@ -499,6 +532,242 @@ out_unlock:
return retval;
}
+void cpuset1_init(struct cpuset *cs)
+{
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+}
+
+void cpuset1_online_css(struct cgroup_subsys_state *css)
+{
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_cpuset_lock_held();
+
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ return;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ cpuset_callback_lock_irq();
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ cpuset_callback_unlock_irq();
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset1_generate_sched_domains()
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ */
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ int nslot_update;
+
+ lockdep_assert_cpuset_lock_held();
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ /*
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < csn; i++)
+ uf_node_init(&csa[i]->node);
+
+ /* Merge overlapping cpusets */
+ for (i = 0; i < csn; i++) {
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
+ uf_union(&csa[i]->node, &csa[j]->node);
+ }
+ }
+
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
+ GFP_KERNEL);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
+ for (j = i; j < csn; j++) {
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, csa[j]);
+ }
+ }
+ if (nslot_update)
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
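The union-find merge described in the comment block above can be sketched as a short standalone userspace C program. Everything in it is illustrative: struct toy_cs, the 8-bit masks standing in for cpumasks, and the path-halving find are stand-ins for the kernel's cpuset/uf_node helpers, not the real implementation.

#include <stdio.h>

struct toy_cs { unsigned int cpus; int parent; };	/* toy cpuset: cpumask + uf link */

static int uf_find(struct toy_cs *v, int i)
{
	while (v[i].parent != i)
		i = v[i].parent = v[v[i].parent].parent;	/* path halving */
	return i;
}

static void uf_union(struct toy_cs *v, int a, int b)
{
	v[uf_find(v, a)].parent = uf_find(v, b);
}

int main(void)
{
	/* three load-balanced cpusets; 0 and 1 overlap, 2 is disjoint */
	struct toy_cs v[] = { { 0x0f, 0 }, { 0x18, 1 }, { 0xe0, 2 } };
	int n = 3, ndoms = 0, i, j;

	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (v[i].cpus & v[j].cpus)		/* cpusets_overlap() */
				uf_union(v, i, j);

	for (i = 0; i < n; i++) {				/* one sched domain per set root */
		unsigned int dom = 0;

		if (uf_find(v, i) != i)
			continue;
		for (j = 0; j < n; j++)
			if (uf_find(v, j) == i)
				dom |= v[j].cpus;
		printf("domain %d: cpus 0x%02x\n", ndoms++, dom);
	}
	return 0;						/* prints 0x1f and 0xe0 */
}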
+
/*
* for the common functions, 'private' gives the type of file
*/
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c06e2e96f79d..832179236529 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -26,7 +26,6 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
@@ -86,12 +85,6 @@ static cpumask_var_t isolated_cpus;
static bool isolated_cpus_updating;
/*
- * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
- */
-static cpumask_var_t boot_hk_cpus;
-static bool have_boot_isolcpus;
-
-/*
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
* - update_partition_sd_lb()
@@ -126,6 +119,17 @@ static bool force_sd_rebuild;
* For simplicity, a local partition can be created under a local or remote
* partition but a remote partition cannot have any partition root in its
* ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus, or by setting
+ * cpus_allowed when exclusive_cpus is not set. For a partition with an
+ * empty exclusive_cpus, any conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed when determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * partition if the users want to get as many CPUs as possible.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -208,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* If cpu_online_mask is used while a hotunplug operation is happening in
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
- .relax_domain_level = -1,
- .remote_partition = false,
};
/*
@@ -268,6 +270,11 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+void lockdep_assert_cpuset_lock_held(void)
+{
+ lockdep_assert_held(&cpuset_mutex);
+}
+
/**
* cpuset_full_lock - Acquire full protection for cpuset modification
*
@@ -286,6 +293,13 @@ void cpuset_full_unlock(void)
cpus_read_unlock();
}
+#ifdef CONFIG_LOCKDEP
+bool lockdep_is_cpuset_held(void)
+{
+ return lockdep_is_held(&cpuset_mutex);
+}
+#endif
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
@@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
- lockdep_assert_held(&cpuset_mutex);
-
- /* Cpusets in the process of attaching should be considered as populated */
- return cgroup_is_populated(cs->css.cgroup) ||
- cs->attach_in_progress;
-}
-
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
@@ -453,9 +458,8 @@ static void guarantee_active_cpus(struct task_struct *tsk,
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
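The reworked loop above folds the emptiness test and the mask computation into a single nodes_and() call while walking up the ancestor chain. A toy standalone sketch of the same idea follows; the masks, the parent array and toy_nodes_and() are invented for illustration only.

#include <stdio.h>

struct toy_cs { unsigned int effective_mems; int parent; };

static unsigned int toy_nodes_and(unsigned int *dst, unsigned int a, unsigned int b)
{
	return *dst = a & b;		/* result mask, non-zero if non-empty */
}

int main(void)
{
	struct toy_cs cs[] = {
		{ 0xff, 0 },		/* root */
		{ 0x0f, 0 },
		{ 0x03, 1 },		/* leaf disjoint from the online nodes */
	};
	unsigned int online = 0xf0;	/* node_states[N_MEMORY] */
	unsigned int pmask;
	int i = 2;

	/* walk up until the intersection with the online nodes is non-empty */
	while (!toy_nodes_and(&pmask, cs[i].effective_mems, online))
		i = cs[i].parent;	/* cs = parent_cs(cs) */

	printf("usable mems: 0x%02x (from ancestor %d)\n", pmask, i);
	return 0;			/* prints 0xf0 from ancestor 0 */
}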
/**
@@ -603,36 +607,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
/**
* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been changed
*
* Returns: true if CPU exclusivity conflict exists, false otherwise
*
* Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ * See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ * - The exclusive_cpus values cannot overlap.
+ * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
*/
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+ bool xcpus_changed)
{
- /* If either cpuset is exclusive, check if they are mutually exclusive */
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return !cpusets_are_exclusive(cs1, cs2);
-
- /* Exclusive_cpus cannot intersect */
- if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
- return true;
-
- /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
- if (!cpumask_empty(cs1->cpus_allowed) &&
- cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
- return true;
+ if (!cpuset_v2())
+ return cpuset1_cpus_excl_conflict(trial, sibling);
- if (!cpumask_empty(cs2->cpus_allowed) &&
- cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+ cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
- return false;
+ /* Exclusive_cpus cannot intersect */
+ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
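A minimal userspace sketch of the two v2 rules implemented above, with plain bit operations standing in for cpumask_intersects()/cpumask_subset(); toy_cs and the example masks are hypothetical.

#include <stdbool.h>
#include <stdio.h>

struct toy_cs { unsigned int cpus_allowed, exclusive_cpus; };

/* Overlapping exclusive_cpus is a conflict; so is swallowing all of a
 * sibling's cpus_allowed when the trial cpuset's exclusive_cpus changed. */
static bool toy_excl_conflict(const struct toy_cs *trial, const struct toy_cs *sibling,
			      bool xcpus_changed)
{
	if (xcpus_changed && sibling->cpus_allowed &&
	    (sibling->cpus_allowed & ~trial->exclusive_cpus) == 0)
		return true;					/* sibling fully covered */

	return trial->exclusive_cpus & sibling->exclusive_cpus;
}

int main(void)
{
	struct toy_cs trial   = { .cpus_allowed = 0x3c, .exclusive_cpus = 0x0c };
	struct toy_cs sibling = { .cpus_allowed = 0x0c, .exclusive_cpus = 0x30 };

	/* The exclusive masks 0x0c and 0x30 do not overlap, but the new
	 * exclusive_cpus covers all of the sibling's cpus_allowed. */
	printf("conflict: %d\n", toy_excl_conflict(&trial, &sibling, true));
	return 0;						/* prints 1 */
}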
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +666,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ bool xcpus_changed;
int ret = 0;
rcu_read_lock();
@@ -682,20 +683,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
@@ -722,10 +709,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
+ xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
- if (cpus_excl_conflict(trial, c))
+ if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
@@ -738,49 +726,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
- /* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key.key) + 1;
-}
/*
* generate_sched_domains()
@@ -820,103 +765,46 @@ static inline int nr_cpusets(void)
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
+
+ if (!cpuset_v2())
+ return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+ /* A NULL csa is checked for and handled correctly below */
+ goto generate_doms;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
- csn = 0;
+ /* Count the valid partitions and cache them in csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -927,40 +815,18 @@ v2:
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -977,45 +843,19 @@ v2:
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
@@ -1055,7 +895,7 @@ void dl_rebuild_rd_accounting(void)
int cpu;
u64 cookie = ++dl_cookie;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1100,53 +940,33 @@ void dl_rebuild_rd_accounting(void)
*/
void rebuild_sched_domains_locked(void)
{
- struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
- struct cpuset *cs;
int ndoms;
+ int i;
lockdep_assert_cpus_held();
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
- /*
- * If we have raced with CPU hotplug, return early to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
- *
- * With no CPUs in any subpartitions, top_cpuset's effective CPUs
- * should be the same as the active CPUs, so checking only top_cpuset
- * is enough to detect racing CPU offlines.
- */
- if (cpumask_empty(subpartitions_cpus) &&
- !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
/*
- * With subpartition CPUs, however, the effective CPUs of a partition
- * root should be only a subset of the active CPUs. Since a CPU in any
- * partition root could be offlined, all must be checked.
- */
- if (!cpumask_empty(subpartitions_cpus)) {
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_valid(cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (!cpumask_subset(cs->effective_cpus,
- cpu_active_mask)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ * cpuset_hotplug_workfn() is now invoked synchronously, so this
+ * function should not race with CPU hotplug, and the effective CPUs
+ * must not include any offline CPU. Passing an offline CPU in doms
+ * to partition_sched_domains() would trigger a kernel panic.
+ *
+ * As a final check, if doms contains any offline CPU, emit a warning
+ * and return early to prevent the panic.
+ */
+ for (i = 0; i < ndoms; ++i) {
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ return;
}
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
}
@@ -1205,11 +1025,10 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
+ * PF_KTHREAD tasks are handled by housekeeping.
* PF_NO_SETAFFINITY tasks are ignored.
- * All per cpu kthreads should have PF_NO_SETAFFINITY
- * flag set, see kthread_set_per_cpu().
*/
- if (task->flags & PF_NO_SETAFFINITY)
+ if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY))
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
@@ -1450,15 +1269,16 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus,
* @new_cpus: cpu mask
* Return: true if there is conflict, false otherwise
*
- * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an
* isolated partition.
*/
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
- if (!have_boot_isolcpus)
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
return false;
- if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ if ((prstate != PRS_ISOLATED) &&
+ !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)))
return true;
return false;
@@ -1477,29 +1297,13 @@ static void update_isolation_cpumasks(void)
if (!isolated_cpus_updating)
return;
- lockdep_assert_cpus_held();
-
- ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
-
- ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+ ret = housekeeping_update(isolated_cpus);
WARN_ON_ONCE(ret < 0);
isolated_cpus_updating = false;
}
/**
- * cpuset_cpu_is_isolated - Check if the given CPU is isolated
- * @cpu: the CPU number to be checked
- * Return: true if CPU is used in an isolated partition, false otherwise
- */
-bool cpuset_cpu_is_isolated(int cpu)
-{
- return cpumask_test_cpu(cpu, isolated_cpus);
-}
-EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-
-/**
* rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
* @parent: Parent cpuset containing all siblings
* @cs: Current cpuset (will be skipped)
@@ -1517,23 +1321,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
int retval = 0;
if (cpumask_empty(excpus))
- return retval;
+ return 0;
/*
- * Exclude exclusive CPUs from siblings
+ * Remove exclusive CPUs from siblings
*/
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
+ struct cpumask *sibling_xcpus;
+
if (sibling == cs)
continue;
- if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
- cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
- retval++;
- continue;
- }
- if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
- cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ /*
+ * If exclusive_cpus is defined, effective_xcpus will always
+ * be a subset. Otherwise, effective_xcpus will only be set
+ * in a valid partition root.
+ */
+ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+ ? sibling->effective_xcpus
+ : sibling->exclusive_cpus;
+
+ if (cpumask_intersects(excpus, sibling_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
@@ -1822,7 +1632,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int parent_prs = parent->partition_root_state;
bool nocpu;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
@@ -2331,17 +2141,13 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_excpus(cp, cp->effective_xcpus);
-
/*
- * Make sure effective_xcpus is properly set for a valid
- * partition root.
+ * Need to compute effective_xcpus if either exclusive_cpus
+ * is non-empty or it is a valid partition root.
*/
- if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
- cpumask_and(cp->effective_xcpus,
- cp->cpus_allowed, parent->effective_xcpus);
- else if (new_prs < 0)
+ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+ compute_excpus(cp, cp->effective_xcpus);
+ if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
@@ -2394,7 +2200,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2403,27 +2209,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
- * directly.
+ * directly. It should not impact a valid partition.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
- if (sibling == cs)
+ if (sibling == cs || is_partition_valid(sibling))
continue;
- if (!is_partition_valid(sibling)) {
- compute_effective_cpumask(tmp->new_cpus, sibling,
- parent);
- if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
- continue;
- } else if (is_remote_partition(sibling)) {
- /*
- * Change in a sibling cpuset won't affect a remote
- * partition root.
- */
+
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
- }
if (!css_tryget_online(&sibling->css))
continue;
@@ -2479,43 +2278,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
return PERR_NONE;
}
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
- struct tmpmasks *tmp)
-{
- int retval;
- struct cpuset *parent = parent_cs(cs);
-
- retval = validate_change(cs, trialcs);
-
- if ((retval == -EINVAL) && cpuset_v2()) {
- struct cgroup_subsys_state *css;
- struct cpuset *cp;
-
- /*
- * The -EINVAL error code indicates that partition sibling
- * CPU exclusivity rule has been violated. We still allow
- * the cpumask change to proceed while invalidating the
- * partition. However, any conflicting sibling partitions
- * have to be marked as invalid too.
- */
- trialcs->prs_err = PERR_NOTEXCL;
- rcu_read_lock();
- cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = user_xcpus(trialcs);
-
- if (is_partition_valid(cp) &&
- cpumask_intersects(xcpus, cp->effective_xcpus)) {
- rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- retval = 0;
- }
- return retval;
-}
-
/**
* partition_cpus_change - Handle partition state changes due to CPU mask updates
* @cs: The target cpuset being modified
@@ -2575,15 +2337,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- if (alloc_tmpmasks(&tmp))
- return -ENOMEM;
-
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
- retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ retval = validate_change(cs, trialcs);
if (retval < 0)
- goto out_free;
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2606,7 +2368,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
-out_free:
+
free_tmpmasks(&tmp);
return retval;
}
@@ -2859,13 +2621,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+ bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (is_in_v2_mode() && nodes_empty(*new_mems))
+ if (is_in_v2_mode() && !has_mems)
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -3265,7 +3027,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
@@ -3621,8 +3383,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
+ cpuset1_init(cs);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3635,17 +3396,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpuset_full_lock();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
@@ -3660,39 +3415,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
+ cpuset1_online_css(css);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
cpuset_full_unlock();
return 0;
}
@@ -3892,16 +3616,13 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
- fmeter_init(&top_cpuset.fmeter);
+ cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
- have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
- if (have_boot_isolcpus) {
- BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
- cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
- }
+ if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
+ cpumask_andnot(isolated_cpus, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
return 0;
}
@@ -4229,7 +3950,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
*/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 81ea38dd6f9d..a5490097fe52 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
}
static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
- u16 mask)
+ u32 mask)
{
struct cgroup_subsys *ss;
int ssid;
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index e12b946278b6..1ea6afffa985 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -14,6 +14,7 @@
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
+#include <linux/refcount.h>
#include <linux/rculist.h>
#include <linux/slab.h>
@@ -71,7 +72,9 @@ struct dmem_cgroup_pool_state {
struct rcu_head rcu;
struct page_counter cnt;
+ struct dmem_cgroup_pool_state *parent;
+ refcount_t ref;
bool inited;
};
@@ -88,6 +91,9 @@ struct dmem_cgroup_pool_state {
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);
+static void dmemcg_free_region(struct kref *ref);
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu);
+
static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
@@ -104,10 +110,38 @@ static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}
+static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool)
+{
+ refcount_inc(&pool->ref);
+}
+
+static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool)
+{
+ return refcount_inc_not_zero(&pool->ref);
+}
+
+static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (!refcount_dec_and_test(&pool->ref))
+ return;
+
+ call_rcu(&pool->rcu, dmemcg_pool_free_rcu);
+}
+
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu);
+
+ if (pool->parent)
+ dmemcg_pool_put(pool->parent);
+ kref_put(&pool->region->ref, dmemcg_free_region);
+ kfree(pool);
+}
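The get/put scheme above — each child pool pins its parent, and the last put releases the whole chain through an RCU callback — can be modelled with a short standalone sketch. The call_rcu() deferral is collapsed into an immediate free here, and toy_pool, pool_new() and pool_put() are made-up names.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_pool {
	atomic_int ref;
	struct toy_pool *parent;
};

static struct toy_pool *pool_new(struct toy_pool *parent)
{
	struct toy_pool *p = calloc(1, sizeof(*p));

	if (!p)
		abort();
	atomic_store(&p->ref, 1);
	p->parent = parent;
	if (parent)
		atomic_fetch_add(&parent->ref, 1);	/* like dmemcg_pool_get(ppool) */
	return p;
}

static void pool_put(struct toy_pool *p)
{
	if (atomic_fetch_sub(&p->ref, 1) != 1)
		return;					/* refcount not yet zero */
	if (p->parent)
		pool_put(p->parent);			/* as in dmemcg_pool_free_rcu() */
	free(p);
	printf("pool freed\n");
}

int main(void)
{
	struct toy_pool *parent = pool_new(NULL);
	struct toy_pool *child  = pool_new(parent);

	pool_put(parent);	/* parent stays alive: the child still holds a ref */
	pool_put(child);	/* frees the child, then the parent */
	return 0;
}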
+
static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
list_del(&pool->region_node);
- kfree(pool);
+ dmemcg_pool_put(pool);
}
static void
@@ -342,6 +376,12 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
page_counter_init(&pool->cnt,
ppool ? &ppool->cnt : NULL, true);
reset_all_resource_limits(pool);
+ refcount_set(&pool->ref, 1);
+ kref_get(&region->ref);
+ if (ppool && !pool->parent) {
+ pool->parent = ppool;
+ dmemcg_pool_get(ppool);
+ }
list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
list_add_tail(&pool->region_node, &region->pools);
@@ -389,6 +429,10 @@ get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *regio
/* Fix up parent links, mark as inited. */
pool->cnt.parent = &ppool->cnt;
+ if (ppool && !pool->parent) {
+ pool->parent = ppool;
+ dmemcg_pool_get(ppool);
+ }
pool->inited = true;
pool = ppool;
@@ -423,7 +467,7 @@ static void dmemcg_free_region(struct kref *ref)
*/
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
- struct list_head *entry;
+ struct dmem_cgroup_pool_state *pool, *next;
if (!region)
return;
@@ -433,11 +477,10 @@ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
/* Remove from global region list */
list_del_rcu(&region->region_node);
- list_for_each_rcu(entry, &region->pools) {
- struct dmem_cgroup_pool_state *pool =
- container_of(entry, typeof(*pool), region_node);
-
+ list_for_each_entry_safe(pool, next, &region->pools, region_node) {
list_del_rcu(&pool->css_node);
+ list_del(&pool->region_node);
+ dmemcg_pool_put(pool);
}
/*
@@ -518,8 +561,10 @@ static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
*/
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
- if (pool)
+ if (pool) {
css_put(&pool->cs->css);
+ dmemcg_pool_put(pool);
+ }
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
@@ -533,6 +578,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = find_cg_pool_locked(cg, region);
if (pool && !READ_ONCE(pool->inited))
pool = NULL;
+ if (pool && !dmemcg_pool_tryget(pool))
+ pool = NULL;
rcu_read_unlock();
while (!pool) {
@@ -541,6 +588,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = get_cg_pool_locked(cg, region, &allocpool);
else
pool = ERR_PTR(-ENODEV);
+ if (!IS_ERR(pool))
+ dmemcg_pool_get(pool);
spin_unlock(&dmemcg_lock);
if (pool == ERR_PTR(-ENOMEM)) {
@@ -576,6 +625,7 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
page_counter_uncharge(&pool->cnt, size);
css_put(&pool->cs->css);
+ dmemcg_pool_put(pool);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
@@ -627,7 +677,9 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
if (ret_limit_pool) {
*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
css_get(&(*ret_limit_pool)->cs->css);
+ dmemcg_pool_get(*ret_limit_pool);
}
+ dmemcg_pool_put(pool);
ret = -EAGAIN;
goto err;
}
@@ -700,6 +752,9 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
if (!region_name[0])
continue;
+ if (!options || !*options)
+ return -EINVAL;
+
rcu_read_lock();
region = dmemcg_get_region_by_name(region_name);
rcu_read_unlock();
@@ -719,6 +774,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
/* And commit */
apply(pool, new_limit);
+ dmemcg_pool_put(pool);
out_put:
kref_put(&region->ref, dmemcg_free_region);
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 9f6ab7dabf67..774702591d26 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y
# Debug Oops, Lockups and Hangs
#
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index fb5be6e9b423..a743e7ffa6c0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -54,24 +54,6 @@ static __always_inline void rcu_task_enter(void)
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
-/* Turn on heavyweight RCU tasks trace readers on kernel exit. */
-static __always_inline void rcu_task_trace_heavyweight_enter(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on kernel entry. */
-static __always_inline void rcu_task_trace_heavyweight_exit(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
-
/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state, that is,
@@ -85,7 +67,6 @@ static noinstr void ct_kernel_exit_state(int offset)
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
- rcu_task_trace_heavyweight_enter(); // Before CT state update!
// RCU is still watching. Better not be in extended quiescent state!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu());
(void)ct_state_inc(offset);
@@ -108,7 +89,6 @@ static noinstr void ct_kernel_enter_state(int offset)
*/
seq = ct_state_inc(offset);
// RCU is now watching. Better not be in an extended quiescent state!
- rcu_task_trace_heavyweight_exit(); // After CT state update!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8df2d773fe3b..01968a5c4a16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -534,6 +534,11 @@ int lockdep_is_cpus_held(void)
{
return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
+
+int lockdep_is_cpus_write_held(void)
+{
+ return percpu_rwsem_is_write_held(&cpu_hotplug_lock);
+}
#endif
static void lockdep_acquire_cpus_lock(void)
@@ -1410,6 +1415,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpus_write_lock();
+ /*
+ * Keep at least one housekeeping cpu onlined to avoid generating
+ * an empty sched_domain span.
+ */
+ if (cpumask_any_and(cpu_online_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN)) >= nr_cpu_ids) {
+ ret = -EBUSY;
+ goto out;
+ }
+
cpuhp_tasks_frozen = tasks_frozen;
prev_state = cpuhp_set_state(cpu, st, target);
@@ -1456,22 +1471,8 @@ out:
return ret;
}
-struct cpu_down_work {
- unsigned int cpu;
- enum cpuhp_state target;
-};
-
-static long __cpu_down_maps_locked(void *arg)
-{
- struct cpu_down_work *work = arg;
-
- return _cpu_down(work->cpu, 0, work->target);
-}
-
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
- struct cpu_down_work work = { .cpu = cpu, .target = target, };
-
/*
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
@@ -1480,18 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
-
- /*
- * Ensure that the control task does not run on the to be offlined
- * CPU to prevent a deadlock against cfs_b->period_timer.
- * Also keep at least one housekeeping cpu onlined to avoid generating
- * an empty sched_domain span.
- */
- for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
- if (cpu != work.cpu)
- return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
- }
- return -EBUSY;
+ return _cpu_down(cpu, 0, target);
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 99dac1aa972a..3952b3e102e0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -44,9 +44,15 @@ note_buf_t __percpu *crash_notes;
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
- struct page *vmcoreinfo_page;
+ struct page *vmcoreinfo_base;
+ struct page *vmcoreinfo_pages[DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE)];
+ unsigned int order, nr_pages;
+ int i;
void *safecopy;
+ nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE);
+ order = get_order(VMCOREINFO_BYTES);
+
if (!IS_ENABLED(CONFIG_CRASH_DUMP))
return 0;
if (image->type != KEXEC_TYPE_CRASH)
@@ -61,12 +67,15 @@ int kimage_crash_copy_vmcoreinfo(struct kimage *image)
* happens to generate vmcoreinfo note, hereby we rely on
* vmap for this purpose.
*/
- vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
- if (!vmcoreinfo_page) {
+ vmcoreinfo_base = kimage_alloc_control_pages(image, order);
+ if (!vmcoreinfo_base) {
pr_warn("Could not allocate vmcoreinfo buffer\n");
return -ENOMEM;
}
- safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ for (i = 0; i < nr_pages; i++)
+ vmcoreinfo_pages[i] = vmcoreinfo_base + i;
+
+ safecopy = vmap(vmcoreinfo_pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!safecopy) {
pr_warn("Could not vmap vmcoreinfo buffer\n");
return -ENOMEM;
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
index 401423ba477d..37129243054d 100644
--- a/kernel/crash_dump_dm_crypt.c
+++ b/kernel/crash_dump_dm_crypt.c
@@ -143,6 +143,7 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
{
const struct user_key_payload *ukp;
struct key *key;
+ int ret = 0;
kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
key = request_key(&key_type_logon, dm_key->key_desc, NULL);
@@ -152,20 +153,28 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
return PTR_ERR(key);
}
+ down_read(&key->sem);
ukp = user_key_payload_locked(key);
- if (!ukp)
- return -EKEYREVOKED;
+ if (!ukp) {
+ ret = -EKEYREVOKED;
+ goto out;
+ }
if (ukp->datalen > KEY_SIZE_MAX) {
pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
memcpy(dm_key->data, ukp->data, ukp->datalen);
dm_key->key_size = ukp->datalen;
kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
dm_key->key_desc, dm_key->data);
- return 0;
+
+out:
+ up_read(&key->sem);
+ key_put(key);
+ return ret;
}
struct config_key {
@@ -223,7 +232,7 @@ static void config_key_release(struct config_item *item)
key_count--;
}
-static struct configfs_item_operations config_key_item_ops = {
+static const struct configfs_item_operations config_key_item_ops = {
.release = config_key_release,
};
@@ -298,7 +307,7 @@ static struct configfs_attribute *config_keys_attrs[] = {
* Note that, since no extra work is required on ->drop_item(),
* no ->drop_item() is provided.
*/
-static struct configfs_group_operations config_keys_group_ops = {
+static const struct configfs_group_operations config_keys_group_ops = {
.make_item = config_keys_make_item,
};
diff --git a/kernel/cred.c b/kernel/cred.c
index a6f686b30da1..12a7b1ce5131 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -621,29 +621,6 @@ int set_security_override(struct cred *new, u32 secid)
EXPORT_SYMBOL(set_security_override);
/**
- * set_security_override_from_ctx - Set the security ID in a set of credentials
- * @new: The credentials to alter
- * @secctx: The LSM security context to generate the security ID from.
- *
- * Set the LSM security ID in a set of credentials so that the subjective
- * security is overridden when an alternative set of credentials is used. The
- * security ID is specified in string form as a security context to be
- * interpreted by the LSM.
- */
-int set_security_override_from_ctx(struct cred *new, const char *secctx)
-{
- u32 secid;
- int ret;
-
- ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
- if (ret < 0)
- return ret;
-
- return set_security_override(new, secid);
-}
-EXPORT_SYMBOL(set_security_override_from_ctx);
-
-/**
* set_create_files_as - Set the LSM file create context in a set of credentials
* @new: The credentials to alter
* @inode: The inode to take the context from
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 22fe969c5d2e..f586afd76c80 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
+#include <linux/hex.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..2e55c493c98b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,8 @@
do { \
d->type##_delay_max = tsk->delays->type##_delay_max; \
d->type##_delay_min = tsk->delays->type##_delay_min; \
+ d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \
+ d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \
tmp = d->type##_delay_total + tsk->delays->type##_delay; \
d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
d->type##_count += tsk->delays->type##_count; \
@@ -104,7 +106,8 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
+ u64 *max, u64 *min, struct timespec64 *ts)
{
s64 ns = local_clock() - *start;
unsigned long flags;
@@ -113,8 +116,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- if (ns > *max)
+ if (ns > *max) {
*max = ns;
+ ktime_get_real_ts64(ts);
+ }
if (*min == 0 || ns < *min)
*min = ns;
raw_spin_unlock_irqrestore(lock, flags);
@@ -137,7 +142,8 @@ void __delayacct_blkio_end(struct task_struct *p)
&p->delays->blkio_delay,
&p->delays->blkio_count,
&p->delays->blkio_delay_max,
- &p->delays->blkio_delay_min);
+ &p->delays->blkio_delay_min,
+ &p->delays->blkio_delay_max_ts);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -170,6 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_delay_max = tsk->sched_info.max_run_delay;
d->cpu_delay_min = tsk->sched_info.min_run_delay;
+ d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec;
+ d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
tmp = (s64)d->cpu_run_virtual_total + t3;
@@ -217,7 +225,8 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_delay,
&current->delays->freepages_count,
&current->delays->freepages_delay_max,
- &current->delays->freepages_delay_min);
+ &current->delays->freepages_delay_min,
+ &current->delays->freepages_delay_max_ts);
}
void __delayacct_thrashing_start(bool *in_thrashing)
@@ -241,7 +250,8 @@ void __delayacct_thrashing_end(bool *in_thrashing)
&current->delays->thrashing_delay,
&current->delays->thrashing_count,
&current->delays->thrashing_delay_max,
- &current->delays->thrashing_delay_min);
+ &current->delays->thrashing_delay_min,
+ &current->delays->thrashing_delay_max_ts);
}
void __delayacct_swapin_start(void)
@@ -256,7 +266,8 @@ void __delayacct_swapin_end(void)
&current->delays->swapin_delay,
&current->delays->swapin_count,
&current->delays->swapin_delay_max,
- &current->delays->swapin_delay_min);
+ &current->delays->swapin_delay_min,
+ &current->delays->swapin_delay_max_ts);
}
void __delayacct_compact_start(void)
@@ -271,7 +282,8 @@ void __delayacct_compact_end(void)
&current->delays->compact_delay,
&current->delays->compact_count,
&current->delays->compact_delay_max,
- &current->delays->compact_delay_min);
+ &current->delays->compact_delay_min,
+ &current->delays->compact_delay_max_ts);
}
void __delayacct_wpcopy_start(void)
@@ -286,7 +298,8 @@ void __delayacct_wpcopy_end(void)
&current->delays->wpcopy_delay,
&current->delays->wpcopy_count,
&current->delays->wpcopy_delay_max,
- &current->delays->wpcopy_delay_min);
+ &current->delays->wpcopy_delay_min,
+ &current->delays->wpcopy_delay_max_ts);
}
void __delayacct_irq(struct task_struct *task, u32 delta)
@@ -296,8 +309,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
raw_spin_lock_irqsave(&task->delays->lock, flags);
task->delays->irq_delay += delta;
task->delays->irq_count++;
- if (delta > task->delays->irq_delay_max)
+ if (delta > task->delays->irq_delay_max) {
task->delays->irq_delay_max = delta;
+ ktime_get_real_ts64(&task->delays->irq_delay_max_ts);
+ }
if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
task->delays->irq_delay_min = delta;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d8fd6f779f79..c56004d314dc 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -91,6 +91,16 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
+/*
+ * cma_skip_dt_default_reserved_mem - This is called from the
+ * reserved_mem framework to detect if the default cma region is being
+ * set by the "cma=" kernel parameter.
+ */
+bool __init cma_skip_dt_default_reserved_mem(void)
+{
+ return size_cmdline != -1;
+}
+
#ifdef CONFIG_DMA_NUMA_CMA
static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
@@ -247,10 +257,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
- dma_contiguous_reserve_area(selected_size, selected_base,
- selected_limit,
- &dma_contiguous_default_area,
- fixed);
+ ret = dma_contiguous_reserve_area(selected_size, selected_base,
+ selected_limit,
+ &dma_contiguous_default_area,
+ fixed);
+ if (ret)
+ return;
ret = dma_heap_cma_register_heap(dma_contiguous_default_area);
if (ret)
@@ -470,12 +482,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
struct cma *cma;
int err;
- if (size_cmdline != -1 && default_cma) {
- pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n",
- rmem->name);
- return -EBUSY;
- }
-
if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 37163eb49f9f..ee29c47781e3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -638,7 +638,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_alloc_direct(dev, ops)) {
+ if (dma_alloc_direct(dev, ops) || arch_dma_alloc_direct(dev)) {
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
} else if (use_dma_iommu(dev)) {
cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
@@ -679,7 +679,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
return;
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
- if (dma_alloc_direct(dev, ops))
+ if (dma_alloc_direct(dev, ops) || arch_dma_free_direct(dev, dma_handle))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
else if (use_dma_iommu(dev))
iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 26392badc36b..2b2fbb709242 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -184,6 +184,12 @@ static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
return pool;
}
+#ifdef CONFIG_ZONE_DMA32
+#define has_managed_dma32 has_managed_zone(ZONE_DMA32)
+#else
+#define has_managed_dma32 false
+#endif
+
static int __init dma_atomic_pool_init(void)
{
int ret = 0;
@@ -199,17 +205,20 @@ static int __init dma_atomic_pool_init(void)
}
INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
- atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
+ /* All memory might be in the DMA zone(s) to begin with */
+ if (has_managed_zone(ZONE_NORMAL)) {
+ atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL);
- if (!atomic_pool_kernel)
- ret = -ENOMEM;
+ if (!atomic_pool_kernel)
+ ret = -ENOMEM;
+ }
if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
ret = -ENOMEM;
}
- if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ if (has_managed_dma32) {
atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA32);
if (!atomic_pool_dma32)
@@ -224,11 +233,11 @@ postcore_initcall(dma_atomic_pool_init);
static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
{
if (prev == NULL) {
- if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
- return atomic_pool_dma32;
- if (atomic_pool_dma && (gfp & GFP_DMA))
- return atomic_pool_dma;
- return atomic_pool_kernel;
+ if (gfp & GFP_DMA)
+ return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel;
+ if (gfp & GFP_DMA32)
+ return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel;
+ return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma;
}
if (prev == atomic_pool_kernel)
return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
@@ -268,15 +277,20 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size,
{
struct gen_pool *pool = NULL;
struct page *page;
+ bool pool_found = false;
while ((pool = dma_guess_pool(pool, gfp))) {
+ pool_found = true;
page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
phys_addr_ok);
if (page)
return page;
}
- WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
+ if (pool_found)
+ WARN(!(gfp & __GFP_NOWARN), "DMA pool exhausted for %s\n", dev_name(dev));
+ else
+ WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
return NULL;
}
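The preference chains in dma_guess_pool() above rely on the GNU "a ?: b" extension (also used throughout the kernel), which yields a when it is non-NULL and falls back to b otherwise. A toy standalone sketch of that fallback order follows; the string-valued "pools" and guess_pool() are invented for illustration, and it needs gcc or clang for the ?: extension.

#include <stdio.h>

static const char *pool_dma;			/* NULL: no ZONE_DMA pool configured */
static const char *pool_dma32 = "dma32";
static const char *pool_kernel = "kernel";

static const char *guess_pool(int want_dma, int want_dma32)
{
	if (want_dma)
		return pool_dma ?: pool_dma32 ?: pool_kernel;
	if (want_dma32)
		return pool_dma32 ?: pool_dma ?: pool_kernel;
	return pool_kernel ?: pool_dma32 ?: pool_dma;
}

int main(void)
{
	printf("GFP_DMA   -> %s\n", guess_pool(1, 0));	/* falls back to dma32 */
	printf("GFP_DMA32 -> %s\n", guess_pool(0, 1));
	printf("normal    -> %s\n", guess_pool(0, 0));
	return 0;
}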
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 5c792b30c58a..9ef63e414791 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
#endif
+/* TIF bits which prevent a time slice extension. */
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Since rseq slice ext has a direct correlation to the worst case
+ * scheduling latency (schedule is delayed after all), only have it affect
+ * LAZY reschedules on PREEMPT_RT for now.
+ *
+ * However, since this delay only applies to userspace, a value for
+ * rseq_slice_extension_nsec that is strictly less than the worst case
+ * kernel space preempt_disable() region should mean the scheduling latency
+ * is not affected, even for !LAZY.
+ *
+ * That value depends on the hardware at hand, though, so it cannot be
+ * pre-determined in any sensible way. Hence punt on this problem for now.
+ */
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY)
+#else
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#endif
+#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)
+
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+ if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
deleted file mode 100644
index f6e6d02f07fe..000000000000
--- a/kernel/entry/common.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _COMMON_H
-#define _COMMON_H
-
-bool syscall_user_dispatch(struct pt_regs *regs);
-
-#endif
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 940a597ded40..cd4967a9c53e 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -1,104 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/audit.h>
#include <linux/entry-common.h>
-#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
+/* Out of line to prevent tracepoint code duplication */
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
+long trace_syscall_enter(struct pt_regs *regs, long syscall)
{
- long ret = 0;
-
+ trace_sys_enter(regs, syscall);
/*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number. Reread it.
*/
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = ptrace_report_syscall_entry(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing();
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
- trace_sys_enter(regs, syscall);
- /*
- * Probes or BPF hooks in the tracepoint may have changed the
- * system call number as well.
- */
- syscall = syscall_get_nr(current, regs);
- }
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
+ return syscall_get_nr(current, regs);
}
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
+void trace_syscall_exit(struct pt_regs *regs, long ret)
{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
- bool step;
-
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
- }
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- ptrace_report_syscall_exit(regs, step);
+ trace_sys_exit(regs, ret);
}
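
With the syscall work handling moved out of this file, the remaining helpers are only the out-of-line tracepoint wrappers. The fragment below is a hedged sketch of how a caller might use them when SYSCALL_WORK_SYSCALL_TRACEPOINT is set; example_syscall_work() is an invented integration point, not the real entry code, and the fragment is not compilable on its own.

/* Illustrative caller only; the real integration lives in the inlined
 * syscall work handling, not here.
 */
static long example_syscall_work(struct pt_regs *regs, long syscall,
				 unsigned long work)
{
	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	/* ... ptrace/seccomp/audit handling would go here ... */

	return syscall;
}
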
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index a9055eccb27e..d89dffcc2d64 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -2,6 +2,8 @@
/*
* Copyright (C) 2020 Collabora Ltd.
*/
+
+#include <linux/entry-common.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
@@ -15,8 +17,6 @@
#include <asm/syscall.h>
-#include "common.h"
-
static void trigger_sigsys(struct pt_regs *regs)
{
struct kernel_siginfo info;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1f6589578703..9d24b6e0c91f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -246,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+ if (!is_user_task(current))
goto exit_put;
regs = task_pt_regs(current);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..e18119f30c29 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -57,6 +57,7 @@
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
#include <linux/unwind_deferred.h>
+#include <linux/kvm_types.h>
#include "internal.h"
@@ -166,6 +167,18 @@ enum event_type_t {
EVENT_CPU = 0x10,
EVENT_CGROUP = 0x20,
+ /*
+ * EVENT_GUEST is set when scheduling in/out events between the host
+ * and a guest with a mediated vPMU. Among other things, EVENT_GUEST
+ * is used:
+ *
+ * - In for_each_epc() to skip PMUs that don't support events in a
+ * MEDIATED_VPMU guest, i.e. don't need to be context switched.
+ * - To indicate the start/end point of the events in a guest. Guest
+ * running time is deducted for host-only (exclude_guest) events.
+ */
+ EVENT_GUEST = 0x40,
+ EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
/* compound helpers */
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
@@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
+#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU
+static DEFINE_PER_CPU(bool, guest_ctx_loaded);
+
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+ return __this_cpu_read(guest_ctx_loaded);
+}
+#else
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+ return false;
+}
+#endif
+
/*
* perf event paranoia level:
* -1 - not paranoid at all
@@ -779,33 +806,97 @@ do { \
___p; \
})
-#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
+{
+ if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
+ return true;
+ if ((event_type & EVENT_GUEST) &&
+ !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
+ return true;
+ return false;
+}
+
+#define for_each_epc(_epc, _ctx, _pmu, _event_type) \
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
- if (_cgroup && !_epc->nr_cgroups) \
+ if (perf_skip_pmu_ctx(_epc, _event_type)) \
continue; \
else if (_pmu && _epc->pmu != _pmu) \
continue; \
else
-static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+static void perf_ctx_disable(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
- for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ for_each_epc(pmu_ctx, ctx, NULL, event_type)
perf_pmu_disable(pmu_ctx->pmu);
}
-static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+static void perf_ctx_enable(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
- for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ for_each_epc(pmu_ctx, ctx, NULL, event_type)
perf_pmu_enable(pmu_ctx->pmu);
}
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv)
+{
+ if (adv)
+ time->time += now - time->stamp;
+ time->stamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(time->offset, time->time - time->stamp);
+}
+
+static_assert(offsetof(struct perf_event_context, timeguest) -
+ offsetof(struct perf_event_context, time) ==
+ sizeof(struct perf_time_ctx));
+
+#define T_TOTAL 0
+#define T_GUEST 1
+
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+ struct perf_time_ctx *times)
+{
+ u64 time = times[T_TOTAL].time;
+
+ if (event->attr.exclude_guest)
+ time -= times[T_GUEST].time;
+
+ return time;
+}
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+ struct perf_time_ctx *times,
+ u64 now)
+{
+ if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) {
+ /*
+ * (now + times[total].offset) - (now + times[guest].offset) :=
+ * times[total].offset - times[guest].offset
+ */
+ return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset);
+ }
+
+ return now + READ_ONCE(times[T_TOTAL].offset);
+}
+
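
The offset trick in update_perf_time_ctx() and the exclude_guest handling in __perf_event_time_ctx_now() reduce to plain modular arithmetic. The toy numbers below are invented purely to show that time + (now - stamp) equals now + offset once offset = time - stamp has been cached for the lockless (NMI) reader.

/* Toy arithmetic only: demonstrates why caching offset = time - stamp
 * lets an NMI-safe reader compute "time as of now" with a single load.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t time = 1000, stamp = 4000;	/* accumulated time, last update */
	uint64_t offset = time - stamp;		/* wraps in u64, which is fine */
	uint64_t now = 4700;

	/* Locked update path and lockless reader path agree: */
	printf("update path: %llu\n",
	       (unsigned long long)(time + (now - stamp)));
	printf("reader path: %llu\n",
	       (unsigned long long)(now + offset));
	return 0;
}
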
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event)
return event->cgrp != NULL;
}
+static_assert(offsetof(struct perf_cgroup_info, timeguest) -
+ offsetof(struct perf_cgroup_info, time) ==
+ sizeof(struct perf_time_ctx));
+
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
- return t->time;
+ return __perf_event_time_ctx(event, &t->time);
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
@@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
t = per_cpu_ptr(event->cgrp->info, event->cpu);
if (!__load_acquire(&t->active))
- return t->time;
- now += READ_ONCE(t->timeoffset);
- return now;
+ return __perf_event_time_ctx(event, &t->time);
+
+ return __perf_event_time_ctx_now(event, &t->time, now);
}
-static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
- if (adv)
- info->time += now - info->timestamp;
- info->timestamp = now;
- /*
- * see update_context_time()
- */
- WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
+ update_perf_time_ctx(&info->timeguest, now, adv);
+}
+
+static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
+{
+ update_perf_time_ctx(&info->time, now, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_cgrp_guest_time(info, now, true);
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
@@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- __update_cgrp_time(info, now, true);
+ update_cgrp_time(info, now);
if (final)
__store_release(&info->active, 0);
}
@@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
* Do not update time when cgroup is not active
*/
if (info->active)
- __update_cgrp_time(info, perf_clock(), true);
+ update_cgrp_time(info, perf_clock());
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
struct perf_event_context *ctx = &cpuctx->ctx;
struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- __update_cgrp_time(info, ctx->timestamp, false);
- __store_release(&info->active, 1);
+ if (guest) {
+ __update_cgrp_guest_time(info, ctx->time.stamp, false);
+ } else {
+ update_perf_time_ctx(&info->time, ctx->time.stamp, false);
+ __store_release(&info->active, 1);
+ }
}
}
@@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task)
return;
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
- perf_ctx_disable(&cpuctx->ctx, true);
+ perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
@@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
- perf_ctx_enable(&cpuctx->ctx, true);
+ perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
}
@@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx)
*/
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
- u64 now = perf_clock();
-
lockdep_assert_held(&ctx->lock);
- if (adv)
- ctx->time += now - ctx->timestamp;
- ctx->timestamp = now;
+ update_perf_time_ctx(&ctx->time, perf_clock(), adv);
+}
- /*
- * The above: time' = time + (now - timestamp), can be re-arranged
- * into: time` = now + (time - timestamp), which gives a single value
- * offset to compute future time without locks on.
- *
- * See perf_event_time_now(), which can be used from NMI context where
- * it's (obviously) not possible to acquire ctx->lock in order to read
- * both the above values in a consistent manner.
- */
- WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
+ lockdep_assert_held(&ctx->lock);
+
+ /* must be called after __update_context_time(); */
+ update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
}
static void update_context_time(struct perf_event_context *ctx)
{
__update_context_time(ctx, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_context_guest_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
@@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event)
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx->time;
+ return __perf_event_time_ctx(event, &ctx->time);
}
static u64 perf_event_time_now(struct perf_event *event, u64 now)
@@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
return perf_cgroup_event_time_now(event, now);
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
- return ctx->time;
+ return __perf_event_time_ctx(event, &ctx->time);
- now += READ_ONCE(ctx->timeoffset);
- return now;
+ return __perf_event_time_ctx_now(event, &ctx->time, now);
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
}
static inline void
-__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx,
+ bool final, enum event_type_t event_type)
{
if (ctx->is_active & EVENT_TIME) {
if (ctx->is_active & EVENT_FROZEN)
return;
+
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, final);
+ /* vPMU should not stop time */
+ update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final);
}
}
static inline void
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
- __ctx_time_update(cpuctx, ctx, false);
+ __ctx_time_update(cpuctx, ctx, false, 0);
}
/*
@@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
}
/*
@@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ for_each_epc(epc, &cpuctx->ctx, pmu, 0)
perf_pmu_disable(epc->pmu);
if (task_ctx) {
- for_each_epc(epc, task_ctx, pmu, false)
+ for_each_epc(epc, task_ctx, pmu, 0)
perf_pmu_disable(epc->pmu);
task_ctx_sched_out(task_ctx, pmu, event_type);
@@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
else if (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx, pmu);
+ perf_event_sched_in(cpuctx, task_ctx, pmu, 0);
- for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ for_each_epc(epc, &cpuctx->ctx, pmu, 0)
perf_pmu_enable(epc->pmu);
if (task_ctx) {
- for_each_epc(epc, task_ctx, pmu, false)
+ for_each_epc(epc, task_ctx, pmu, 0)
perf_pmu_enable(epc->pmu);
}
}
@@ -3479,11 +3576,10 @@ static void
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ enum event_type_t active_type = event_type & ~EVENT_FLAGS;
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
- bool cgroup = event_type & EVENT_CGROUP;
- event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
*
* would only update time for the pinned events.
*/
- __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
- ctx->is_active &= ~event_type;
+ ctx->is_active &= ~active_type;
if (!(ctx->is_active & EVENT_ALL)) {
/*
@@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule out all exclude_guest events of PMU
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = EVENT_ALL;
+ __update_context_guest_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx, true);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
__pmu_ctx_sched_out(pmu_ctx, is_active);
}
@@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
if (local_read(&ctx->nr_no_switch_fast) ||
@@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, task, false);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3739,13 +3846,13 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
inside_switch:
perf_ctx_sched_task_cb(ctx, task, false);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
event_update_userpage(event);
}
+struct merge_sched_data {
+ int can_add_hw;
+ enum event_type_t event_type;
+};
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- int *can_add_hw = data;
+ struct merge_sched_data *msd = data;
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, *can_add_hw)) {
+ /*
+ * Don't schedule in any host events from PMU with
+ * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running.
+ */
+ if (is_guest_mediated_pmu_loaded() &&
+ event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU &&
+ !(msd->event_type & EVENT_GUEST))
+ return 0;
+
+ if (group_can_go_on(event, msd->can_add_hw)) {
if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
- *can_add_hw = 0;
+ msd->can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -4032,11 +4153,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
static void pmu_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- int can_add_hw = 1;
+ struct merge_sched_data msd = {
+ .can_add_hw = 1,
+ .event_type = event_type,
+ };
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
+ merge_sched_in, &msd);
}
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
@@ -4045,20 +4170,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
struct perf_event_context *ctx = pmu_ctx->ctx;
if (event_type & EVENT_PINNED)
- pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type);
if (event_type & EVENT_FLEXIBLE)
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type);
}
static void
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ enum event_type_t active_type = event_type & ~EVENT_FLAGS;
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
- bool cgroup = event_type & EVENT_CGROUP;
-
- event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -4066,9 +4189,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
return;
if (!(is_active & EVENT_TIME)) {
+ /* EVENT_TIME should be active while the guest runs */
+ WARN_ON_ONCE(event_type & EVENT_GUEST);
/* start ctx time */
__update_context_time(ctx, false);
- perf_cgroup_set_timestamp(cpuctx);
+ perf_cgroup_set_timestamp(cpuctx, false);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
@@ -4076,7 +4201,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
barrier();
}
- ctx->is_active |= (event_type | EVENT_TIME);
+ ctx->is_active |= active_type | EVENT_TIME;
if (ctx->task) {
if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
@@ -4084,21 +4209,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule in the required exclude_guest events of PMU
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = event_type & EVENT_ALL;
+
+ /*
+ * Update ctx time to set the new start time for
+ * the exclude_guest events.
+ */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED) {
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST));
}
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE) {
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST));
}
}
@@ -4114,11 +4255,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
perf_ctx_sched_task_cb(ctx, task, true);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -4131,7 +4272,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -4141,18 +4282,18 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx, false);
+ perf_ctx_disable(&cpuctx->ctx, 0);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx, NULL);
+ perf_event_sched_in(cpuctx, ctx, NULL, 0);
perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx, false);
+ perf_ctx_enable(&cpuctx->ctx, 0);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -5280,9 +5421,20 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
return -ENOMEM;
for (;;) {
- if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) {
if (old)
perf_free_ctx_data_rcu(old);
+			/*
+			 * The try_cmpxchg() above pairs with the try_cmpxchg()
+			 * in detach_task_ctx_data() such that if we race with
+			 * perf_event_exit_task(), we must observe PF_EXITING.
+			 */
+ if (task->flags & PF_EXITING) {
+ /* detach_task_ctx_data() may free it already */
+ if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+ }
return 0;
}
@@ -5328,6 +5480,8 @@ again:
/* Allocate everything */
scoped_guard (rcu) {
for_each_process_thread(g, p) {
+ if (p->flags & PF_EXITING)
+ continue;
cd = rcu_dereference(p->perf_ctx_data);
if (cd && !cd->global) {
cd->global = 1;
@@ -5594,6 +5748,8 @@ static void __free_event(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
+ security_perf_event_free(event);
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5647,6 +5803,8 @@ static void __free_event(struct perf_event *event)
call_rcu(&event->rcu_head, free_event_rcu);
}
+static void mediated_pmu_unaccount_event(struct perf_event *event);
+
DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
/* vs perf_event_alloc() success */
@@ -5656,8 +5814,7 @@ static void _free_event(struct perf_event *event)
irq_work_sync(&event->pending_disable_irq);
unaccount_event(event);
-
- security_perf_event_free(event);
+ mediated_pmu_unaccount_event(event);
if (event->rb) {
/*
@@ -6180,6 +6337,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset)
}
EXPORT_SYMBOL_GPL(perf_event_pause);
+#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU
+static atomic_t nr_include_guest_events __read_mostly;
+
+static atomic_t nr_mediated_pmu_vms __read_mostly;
+static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+
+/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */
+static inline bool is_include_guest_event(struct perf_event *event)
+{
+ if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) &&
+ !event->attr.exclude_guest)
+ return true;
+
+ return false;
+}
+
+static int mediated_pmu_account_event(struct perf_event *event)
+{
+ if (!is_include_guest_event(event))
+ return 0;
+
+ if (atomic_inc_not_zero(&nr_include_guest_events))
+ return 0;
+
+ guard(mutex)(&perf_mediated_pmu_mutex);
+ if (atomic_read(&nr_mediated_pmu_vms))
+ return -EOPNOTSUPP;
+
+ atomic_inc(&nr_include_guest_events);
+ return 0;
+}
+
+static void mediated_pmu_unaccount_event(struct perf_event *event)
+{
+ if (!is_include_guest_event(event))
+ return;
+
+ if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events)))
+ return;
+
+ atomic_dec(&nr_include_guest_events);
+}
+
+/*
+ * Currently invoked at VM creation to
+ * - Check whether there are existing !exclude_guest events of PMU with
+ * PERF_PMU_CAP_MEDIATED_VPMU
+ * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on
+ * PMUs with PERF_PMU_CAP_MEDIATED_VPMU
+ *
+ * There is no impact on PMUs without PERF_PMU_CAP_MEDIATED_VPMU; perf
+ * still owns all of their resources.
+ */
+int perf_create_mediated_pmu(void)
+{
+ if (atomic_inc_not_zero(&nr_mediated_pmu_vms))
+ return 0;
+
+ guard(mutex)(&perf_mediated_pmu_mutex);
+ if (atomic_read(&nr_include_guest_events))
+ return -EBUSY;
+
+ atomic_inc(&nr_mediated_pmu_vms);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu);
+
+void perf_release_mediated_pmu(void)
+{
+ if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms)))
+ return;
+
+ atomic_dec(&nr_mediated_pmu_vms);
+}
+EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu);
+
+/* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
+void perf_load_guest_context(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lockdep_assert_irqs_disabled();
+
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+
+ if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
+ return;
+
+ perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
+ if (cpuctx->task_ctx) {
+ perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+ task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
+ }
+
+ perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+ if (cpuctx->task_ctx)
+ perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+
+ __this_cpu_write(guest_ctx_loaded, true);
+}
+EXPORT_SYMBOL_GPL(perf_load_guest_context);
+
+void perf_put_guest_context(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lockdep_assert_irqs_disabled();
+
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+
+ if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
+ return;
+
+ perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+ if (cpuctx->task_ctx)
+ perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+
+ perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
+
+ if (cpuctx->task_ctx)
+ perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+ perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+
+ __this_cpu_write(guest_ctx_loaded, false);
+}
+EXPORT_SYMBOL_GPL(perf_put_guest_context);
+#else
+static int mediated_pmu_account_event(struct perf_event *event) { return 0; }
+static void mediated_pmu_unaccount_event(struct perf_event *event) {}
+#endif
+
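
The four exported entry points above are meant to be paired by the hypervisor side: account the VM at creation, release it at teardown, and load/put the guest context around VM entry with IRQs disabled. The sketch below shows that intended pairing; only the perf_*() calls come from this patch, the example_* wrapper names are invented.

/* Hypothetical KVM-side pairing; integration details are omitted. */
static int example_vm_create(void)
{
	/* Fails with -EBUSY while !exclude_guest mediated-PMU events exist. */
	return perf_create_mediated_pmu();
}

static void example_vm_destroy(void)
{
	perf_release_mediated_pmu();
}

static void example_vcpu_run(void)
{
	unsigned long flags;

	/* Both helpers assert that IRQs are disabled. */
	local_irq_save(flags);
	perf_load_guest_context();
	/* ... enter the guest; it now owns the mediated PMU ... */
	perf_put_guest_context();
	local_irq_restore(flags);
}
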
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
@@ -6548,22 +6837,22 @@ void perf_event_update_userpage(struct perf_event *event)
goto unlock;
/*
- * compute total_time_enabled, total_time_running
- * based on snapshot values taken when the event
- * was last scheduled in.
+ * Disable preemption to guarantee consistent time stamps are stored to
+ * the user page.
+ */
+ preempt_disable();
+
+ /*
+ * Compute total_time_enabled, total_time_running based on snapshot
+ * values taken when the event was last scheduled in.
*
- * we cannot simply called update_context_time()
- * because of locking issue as we can be called in
- * NMI context
+ * We cannot simply call update_context_time() because doing so would
+ * lead to deadlock when called from NMI context.
*/
calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page;
- /*
- * Disable preemption to guarantee consistent time stamps are stored to
- * the user page.
- */
- preempt_disable();
+
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
@@ -6997,6 +7286,15 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
if (data_page_nr(event->rb) != nr_pages)
return -EINVAL;
+ /*
+ * If this event doesn't have mmap_count, we're attempting to
+ * create an alias of another event's mmap(); this would mean
+ * both events will end up scribbling the same user_page;
+ * which makes no sense.
+ */
+ if (!refcount_read(&event->mmap_count))
+ return -EBUSY;
+
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
@@ -7374,6 +7672,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi);
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
@@ -7388,6 +7687,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
if (cbs->handle_intel_pt_intr)
static_call_update(__perf_guest_handle_intel_pt_intr,
cbs->handle_intel_pt_intr);
+
+ if (cbs->handle_mediated_pmi)
+ static_call_update(__perf_guest_handle_mediated_pmi,
+ cbs->handle_mediated_pmi);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
@@ -7399,8 +7702,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
rcu_assign_pointer(perf_guest_cbs, NULL);
static_call_update(__perf_guest_state, (void *)&__static_call_return0);
static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
- static_call_update(__perf_guest_handle_intel_pt_intr,
- (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
@@ -7451,7 +7754,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ } else if (is_user_task(current)) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -7860,13 +8163,11 @@ static void perf_output_read(struct perf_output_handle *handle,
u64 read_format = event->attr.read_format;
/*
- * compute total_time_enabled, total_time_running
- * based on snapshot values taken when the event
- * was last scheduled in.
+ * Compute total_time_enabled, total_time_running based on snapshot
+ * values taken when the event was last scheduled in.
*
- * we cannot simply called update_context_time()
- * because of locking issue as we are called in
- * NMI context
+ * We cannot simply call update_context_time() because doing so would
+ * lead to deadlock when called from NMI context.
*/
if (read_format & PERF_FORMAT_TOTAL_TIMES)
calc_timer_values(event, &now, &enabled, &running);
@@ -8091,7 +8392,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ if (is_user_task(current)) {
struct page *p;
pagefault_disable();
@@ -8206,7 +8507,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user &&
- !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+ is_user_task(current);
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
@@ -12034,7 +12335,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now)
static void task_clock_event_start(struct perf_event *event, int flags)
{
event->hw.state = 0;
- local64_set(&event->hw.prev_count, event->ctx->time);
+ local64_set(&event->hw.prev_count, event->ctx->time.time);
perf_swevent_start_hrtimer(event);
}
@@ -12043,7 +12344,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags)
event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
if (flags & PERF_EF_UPDATE)
- task_clock_event_update(event, event->ctx->time);
+ task_clock_event_update(event, event->ctx->time.time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
@@ -12063,8 +12364,8 @@ static void task_clock_event_del(struct perf_event *event, int flags)
static void task_clock_event_read(struct perf_event *event)
{
u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- u64 time = event->ctx->time + delta;
+ u64 delta = now - event->ctx->time.stamp;
+ u64 time = event->ctx->time.time + delta;
task_clock_event_update(event, time);
}
@@ -13146,6 +13447,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
return ERR_PTR(err);
+ err = mediated_pmu_account_event(event);
+ if (err)
+ return ERR_PTR(err);
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
@@ -14285,8 +14590,11 @@ void perf_event_exit_task(struct task_struct *task)
/*
* Detach the perf_ctx_data for the system-wide event.
+ *
+ * Done without holding global_ctx_data_rwsem; typically
+ * attach_global_ctx_data() will skip over this task, but otherwise
+ * attach_task_ctx_data() will observe PF_EXITING.
*/
- guard(percpu_read)(&global_ctx_data_rwsem);
detach_task_ctx_data(task);
}
@@ -14789,7 +15097,8 @@ static void perf_event_exit_cpu_context(int cpu)
ctx = &cpuctx->ctx;
mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ if (ctx->nr_events)
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d546d32390a8..424ef2235b07 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -179,16 +179,16 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
@@ -323,7 +323,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
return ret == 0 ? -EBUSY : ret;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
ptr = kaddr + (vaddr & ~PAGE_MASK);
if (unlikely(*ptr + d < 0)) {
@@ -336,7 +336,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
*ptr += d;
ret = 0;
out:
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_page(page);
return ret;
}
@@ -1138,7 +1138,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ list_for_each_entry(uc, &uprobe->consumers, cons_node) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1694,6 +1694,12 @@ static const struct vm_special_mapping xol_mapping = {
.mremap = xol_mremap,
};
+unsigned long __weak arch_uprobe_get_xol_area(void)
+{
+ /* Try to map as high as possible, this is only a hint. */
+ return get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
+}
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1709,9 +1715,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
if (!area->vaddr) {
- /* Try to map as high as possible, this is only a hint. */
- area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
- PAGE_SIZE, 0, 0);
+ area->vaddr = arch_uprobe_get_xol_area();
if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
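
Because arch_uprobe_get_xol_area() is now a __weak hook, an architecture can override the placement hint for the XOL area. The body below is a hypothetical override, not taken from any real architecture; ARCH_XOL_HINT is invented for illustration.

/* Hypothetical arch override of the weak default added above. */
#define ARCH_XOL_HINT	(TASK_SIZE / 2)

unsigned long arch_uprobe_get_xol_area(void)
{
	/* Still only a hint; get_unmapped_area() may pick elsewhere. */
	return get_unmapped_area(NULL, ARCH_XOL_HINT - PAGE_SIZE,
				 PAGE_SIZE, 0, 0);
}
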
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..e832da9d15a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
@@ -1356,7 +1357,7 @@ struct file *get_task_exe_file(struct task_struct *task)
* @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
- * this kernel workthread has transiently adopted a user mm with use_mm,
+ * this kernel worker thread has transiently adopted a user mm with kthread_use_mm,
* to do its AIO) is not set and if so returns a reference to it, after
* bumping up the use count. User must release the mm via mmput()
* after use. Typically used by /proc and ptrace.
@@ -1828,9 +1829,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
- p->trc_reader_special.s = 0;
- INIT_LIST_HEAD(&p->trc_holdout_list);
- INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
@@ -2071,7 +2069,7 @@ __latent_entropy struct task_struct *copy_process(
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
- * Clear TID on mm_release()?
+ * TID is cleared in mm_release() when the task exits
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
@@ -2129,6 +2127,10 @@ __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_IO_URING
p->io_uring = NULL;
+ retval = io_uring_fork(p);
+ if (unlikely(retval))
+ goto bad_fork_cleanup_delayacct;
+ retval = -EAGAIN;
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
@@ -2525,6 +2527,7 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
+ io_uring_free(p);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1b4254d19a73..05cba4e16dad 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -92,9 +92,6 @@ config GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
-config IRQ_TIMINGS
- bool
-
config GENERIC_IRQ_MATRIX_ALLOCATOR
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 6ab3a4055667..86a2e5ae08f9 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,10 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o
-obj-$(CONFIG_IRQ_TIMINGS) += timings.o
-ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
- CFLAGS_timings.o += -DDEBUG
-endif
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 678f094d261a..6147a07d0127 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -974,7 +974,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_state_set_disabled(desc);
if (is_chained) {
desc->action = NULL;
- WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ irq_chip_pm_put(irq_desc_get_irq_data(desc));
}
desc->depth = 1;
}
@@ -1122,7 +1122,7 @@ void irq_cpu_offline(void)
}
#endif
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
/**
@@ -1194,6 +1194,15 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
+#ifdef CONFIG_SMP
+void irq_chip_pre_redirect_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_pre_redirect(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_pre_redirect_parent);
+#endif
+
/**
* irq_chip_set_parent_state - set the state of a parent interrupt.
*
@@ -1476,6 +1485,19 @@ void irq_chip_release_resources_parent(struct irq_data *data)
data->chip->irq_release_resources(data);
}
EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+
+#ifdef CONFIG_SMP
+int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force)
+{
+ struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
+
+ WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
+ irq_data_update_effective_affinity(data, dest);
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
#endif
/**
@@ -1530,20 +1552,20 @@ int irq_chip_pm_get(struct irq_data *data)
}
/**
- * irq_chip_pm_put - Disable power for an IRQ chip
+ * irq_chip_pm_put - Drop a PM reference on an IRQ chip
* @data: Pointer to interrupt specific data
*
- * Disable the power to the IRQ chip referenced by the interrupt data
- * structure, belongs. Note that power will only be disabled, once this
- * function has been called for all IRQs that have called irq_chip_pm_get().
+ * Drop a power management reference, acquired via irq_chip_pm_get(), on the IRQ
+ * chip represented by the interrupt data structure.
+ *
+ * Note that this will not disable power to the IRQ chip until this function
+ * has been called for all IRQs that have called irq_chip_pm_get() and it may
+ * not disable power at all (if user space prevents that, for example).
*/
-int irq_chip_pm_put(struct irq_data *data)
+void irq_chip_pm_put(struct irq_data *data)
{
struct device *dev = irq_get_pm_device(data);
- int retval = 0;
-
- if (IS_ENABLED(CONFIG_PM) && dev)
- retval = pm_runtime_put(dev);
- return (retval < 0) ? retval : 0;
+ if (dev)
+ pm_runtime_put(dev);
}
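
irq_chip_redirect_set_affinity() only records the first CPU of the new mask as the redirect target and reports IRQ_SET_MASK_OK_DONE, so a demultiplexed child interrupt can plug it straight into its irq_chip. The sketch below shows that wiring under stated assumptions: the example_* names are invented, only irq_chip_redirect_set_affinity() comes from this patch.

/* Hypothetical child chip of a demux irqchip. */
static void example_child_mask(struct irq_data *d)   { /* mask bit in demux HW */ }
static void example_child_unmask(struct irq_data *d) { /* unmask bit in demux HW */ }

static struct irq_chip example_demux_child_chip = {
	.name			= "example-demux",
	.irq_mask		= example_child_mask,
	.irq_unmask		= example_child_unmask,
	/* First CPU of the new mask becomes the redirect target. */
	.irq_set_affinity	= irq_chip_redirect_set_affinity,
};
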
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 755346ea9819..cd5689e383b0 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -177,9 +177,11 @@ void irq_migrate_all_off_this_cpu(void)
bool affinity_broken;
desc = irq_to_desc(irq);
- scoped_guard(raw_spinlock, &desc->lock)
+ scoped_guard(raw_spinlock, &desc->lock) {
affinity_broken = migrate_one_irq(desc);
-
+ if (affinity_broken && desc->affinity_notify)
+ irq_affinity_schedule_notify_work(desc);
+ }
if (affinity_broken) {
pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
irq, smp_processor_id());
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 786f5570a640..b7d52821837b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -188,8 +188,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
unsigned int irq = desc->irq_data.irq;
struct irqaction *action;
- record_irq_time(desc);
-
for_each_action_of_desc(desc, action) {
irqreturn_t res;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 0164ca48da59..9412e57056f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -135,6 +135,7 @@ extern bool irq_can_set_affinity_usr(unsigned int irq);
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
+extern void irq_affinity_schedule_notify_work(struct irq_desc *desc);
#ifdef CONFIG_SMP
extern int irq_setup_affinity(struct irq_desc *desc);
@@ -142,7 +143,6 @@ extern int irq_setup_affinity(struct irq_desc *desc);
static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
#endif
-
#define for_each_action_of_desc(desc, act) \
for (act = desc->action; act; act = act->next)
@@ -288,116 +288,6 @@ static inline void
irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
#endif
-#ifdef CONFIG_IRQ_TIMINGS
-
-#define IRQ_TIMINGS_SHIFT 5
-#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT)
-#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1)
-
-/**
- * struct irq_timings - irq timings storing structure
- * @values: a circular buffer of u64 encoded <timestamp,irq> values
- * @count: the number of elements in the array
- */
-struct irq_timings {
- u64 values[IRQ_TIMINGS_SIZE];
- int count;
-};
-
-DECLARE_PER_CPU(struct irq_timings, irq_timings);
-
-extern void irq_timings_free(int irq);
-extern int irq_timings_alloc(int irq);
-
-static inline void irq_remove_timings(struct irq_desc *desc)
-{
- desc->istate &= ~IRQS_TIMINGS;
-
- irq_timings_free(irq_desc_get_irq(desc));
-}
-
-static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
-{
- int irq = irq_desc_get_irq(desc);
- int ret;
-
- /*
- * We don't need the measurement because the idle code already
- * knows the next expiry event.
- */
- if (act->flags & __IRQF_TIMER)
- return;
-
- /*
- * In case the timing allocation fails, we just want to warn,
- * not fail, so letting the system boot anyway.
- */
- ret = irq_timings_alloc(irq);
- if (ret) {
- pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
- irq, ret);
- return;
- }
-
- desc->istate |= IRQS_TIMINGS;
-}
-
-extern void irq_timings_enable(void);
-extern void irq_timings_disable(void);
-
-DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);
-
-/*
- * The interrupt number and the timestamp are encoded into a single
- * u64 variable to optimize the size.
- * 48 bit time stamp and 16 bit IRQ number is way sufficient.
- * Who cares an IRQ after 78 hours of idle time?
- */
-static inline u64 irq_timing_encode(u64 timestamp, int irq)
-{
- return (timestamp << 16) | irq;
-}
-
-static inline int irq_timing_decode(u64 value, u64 *timestamp)
-{
- *timestamp = value >> 16;
- return value & U16_MAX;
-}
-
-static __always_inline void irq_timings_push(u64 ts, int irq)
-{
- struct irq_timings *timings = this_cpu_ptr(&irq_timings);
-
- timings->values[timings->count & IRQ_TIMINGS_MASK] =
- irq_timing_encode(ts, irq);
-
- timings->count++;
-}
-
-/*
- * The function record_irq_time is only called in one place in the
- * interrupts handler. We want this function always inline so the code
- * inside is embedded in the function and the static key branching
- * code can act at the higher level. Without the explicit
- * __always_inline we can end up with a function call and a small
- * overhead in the hotpath for nothing.
- */
-static __always_inline void record_irq_time(struct irq_desc *desc)
-{
- if (!static_branch_likely(&irq_timing_enabled))
- return;
-
- if (desc->istate & IRQS_TIMINGS)
- irq_timings_push(local_clock(), irq_desc_get_irq(desc));
-}
-#else
-static inline void irq_remove_timings(struct irq_desc *desc) {}
-static inline void irq_setup_timings(struct irq_desc *desc,
- struct irqaction *act) {};
-static inline void record_irq_time(struct irq_desc *desc) {}
-#endif /* CONFIG_IRQ_TIMINGS */
-
-
#ifdef CONFIG_GENERIC_IRQ_CHIP
void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f8e4e13dbe33..022b3741dd7a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -78,8 +78,12 @@ static int alloc_masks(struct irq_desc *desc, int node)
return 0;
}
-static void desc_smp_init(struct irq_desc *desc, int node,
- const struct cpumask *affinity)
+static void irq_redirect_work(struct irq_work *work)
+{
+ handle_irq_desc(container_of(work, struct irq_desc, redirect.work));
+}
+
+static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity)
{
if (!affinity)
affinity = irq_default_affinity;
@@ -91,6 +95,7 @@ static void desc_smp_init(struct irq_desc *desc, int node,
#ifdef CONFIG_NUMA
desc->irq_common_data.node = node;
#endif
+ desc->redirect.work = IRQ_WORK_INIT_HARD(irq_redirect_work);
}
static void free_masks(struct irq_desc *desc)
@@ -115,8 +120,6 @@ static inline void free_masks(struct irq_desc *desc) { }
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
const struct cpumask *affinity, struct module *owner)
{
- int cpu;
-
desc->irq_common_data.handler_data = NULL;
desc->irq_common_data.msi_desc = NULL;
@@ -134,8 +137,6 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
}
@@ -621,9 +622,14 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ int cpu;
scoped_guard(raw_spinlock_irqsave, &desc->lock)
desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
+
delete_irq_desc(irq);
}
@@ -766,6 +772,83 @@ int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq)
WARN_ON_ONCE(!in_nmi());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
+
+#ifdef CONFIG_SMP
+static bool demux_redirect_remote(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ unsigned int target_cpu = READ_ONCE(desc->redirect.target_cpu);
+
+ if (desc->irq_data.chip->irq_pre_redirect)
+ desc->irq_data.chip->irq_pre_redirect(&desc->irq_data);
+
+ /*
+ * If the interrupt handler is already running on a CPU that's included
+ * in the interrupt's affinity mask, redirection is not necessary.
+ */
+ if (cpumask_test_cpu(smp_processor_id(), m))
+ return false;
+
+ /*
+ * The desc->action check protects against IRQ shutdown: __free_irq() sets
+ * desc->action to NULL while holding desc->lock, which we also hold.
+ *
+ * Calling irq_work_queue_on() here is safe w.r.t. CPU unplugging:
+ * - takedown_cpu() schedules multi_cpu_stop() on all active CPUs,
+ * including the one that's taken down.
+ * - multi_cpu_stop() acts like a barrier, which means all active
+ * CPUs go through MULTI_STOP_DISABLE_IRQ and disable hard IRQs
+ * *before* the dying CPU runs take_cpu_down() in MULTI_STOP_RUN.
+ * - Hard IRQs are re-enabled at the end of multi_cpu_stop(), *after*
+ * the dying CPU has run take_cpu_down() in MULTI_STOP_RUN.
+ * - Since we run in hard IRQ context, we run either before or after
+ * take_cpu_down() but never concurrently.
+ * - If we run before take_cpu_down(), the dying CPU hasn't been marked
+ * offline yet (it's marked via take_cpu_down() -> __cpu_disable()),
+ * so the WARN in irq_work_queue_on() can't occur.
+ * - Furthermore, the work item we queue will be flushed later via
+ * take_cpu_down() -> cpuhp_invoke_callback_range_nofail() ->
+ * smpcfd_dying_cpu() -> irq_work_run().
+	 * - If we run after take_cpu_down(), target_cpu has already been
+ * updated via take_cpu_down() -> __cpu_disable(), which eventually
+ * calls irq_do_set_affinity() during IRQ migration. So, target_cpu
+ * no longer points to the dying CPU in this case.
+ */
+ if (desc->action)
+ irq_work_queue_on(&desc->redirect.work, target_cpu);
+
+ return true;
+}
+#else /* CONFIG_SMP */
+static bool demux_redirect_remote(struct irq_desc *desc)
+{
+ return false;
+}
+#endif
+
+/**
+ * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt
+ * of a demultiplexing domain.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The hardware interrupt number to convert to a logical one
+ *
+ * Returns: True on success, or false if lookup has failed
+ */
+bool generic_handle_demux_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq)
+{
+ struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);
+
+ if (unlikely(!desc))
+ return false;
+
+ if (demux_redirect_remote(desc))
+ return true;
+
+ return !handle_irq_desc(desc);
+}
+EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq);
+
#endif
/* Dynamic interrupt handling */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 2652c4cfd877..c2258b133939 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -33,6 +33,7 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq
struct irqchip_fwid {
struct fwnode_handle fwnode;
+ struct fwnode_handle *parent;
unsigned int type;
char *name;
phys_addr_t *pa;
@@ -53,8 +54,16 @@ static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode)
return fwid->name;
}
+static struct fwnode_handle *irqchip_fwnode_get_parent(const struct fwnode_handle *fwnode)
+{
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ return fwid->parent;
+}
+
const struct fwnode_operations irqchip_fwnode_ops = {
.get_name = irqchip_fwnode_get_name,
+ .get_parent = irqchip_fwnode_get_parent,
};
EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
@@ -65,6 +74,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* @id: Optional user provided id if name != NULL
* @name: Optional user provided domain name
* @pa: Optional user-provided physical address
+ * @parent: Optional parent fwnode_handle
*
* Allocate a struct irqchip_fwid, and return a pointer to the embedded
* fwnode_handle (or NULL on failure).
@@ -76,7 +86,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
*/
struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
const char *name,
- phys_addr_t *pa)
+ phys_addr_t *pa,
+ struct fwnode_handle *parent)
{
struct irqchip_fwid *fwid;
char *n;
@@ -104,6 +115,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->pa = pa;
+ fwid->parent = parent;
fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops);
return &fwid->fwnode;
}
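As a rough sketch of what the new optional parent enables (an editorial assumption, not taken from the patch): a demultiplexer that allocates a software fwnode for its child domain can pass its own fwnode as the parent, so generic fwnode walking resolves through irqchip_fwnode_get_parent(). IRQCHIP_FWNODE_NAMED and __irq_domain_alloc_fwnode() are existing interfaces; the surrounding names are made up.

/* Hedged sketch: create a named irqchip fwnode that reports @parent */
static struct fwnode_handle *my_alloc_child_fwnode(struct fwnode_handle *parent)
{
	struct fwnode_handle *fwnode;

	fwnode = __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED, 0, "my-demux",
					   NULL, parent);
	if (!fwnode)
		return NULL;

	/* Consumers can now walk up: fwnode_get_parent(fwnode) yields @parent */
	return fwnode;
}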
@@ -187,7 +199,7 @@ static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domai
const struct fwnode_handle *fwnode = info->fwnode;
if (is_fwnode_irqchip(fwnode)) {
- struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+ const struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
/*
* The name_suffix is only intended to be used to avoid a name
@@ -1901,6 +1913,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs);
static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 349ae7979da0..cded3d960eb7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
+#ifdef CONFIG_SMP
+static inline void synchronize_irqwork(struct irq_desc *desc)
+{
+	/* Synchronize pending or in-flight redirect work */
+ irq_work_sync(&desc->redirect.work);
+}
+#else
+static inline void synchronize_irqwork(struct irq_desc *desc) { }
+#endif
+
static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
@@ -107,7 +117,9 @@ EXPORT_SYMBOL(synchronize_hardirq);
static void __synchronize_irq(struct irq_desc *desc)
{
+ synchronize_irqwork(desc);
__synchronize_hardirq(desc, true);
+
/*
* We made sure that no hardirq handler is running. Now verify that no
* threaded handlers are active.
@@ -217,8 +229,7 @@ static inline void irq_validate_effective_affinity(struct irq_data *data) { }
static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
-int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
struct irq_desc *desc = irq_data_to_desc(data);
@@ -347,6 +358,21 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
return true;
}
+/**
+ * irq_affinity_schedule_notify_work - Schedule work to notify about affinity change
+ * @desc: Interrupt descriptor whose affinity changed
+ */
+void irq_affinity_schedule_notify_work(struct irq_desc *desc)
+{
+ lockdep_assert_held(&desc->lock);
+
+ kref_get(&desc->affinity_notify->kref);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release);
+ }
+}
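The kref dance above exists because the notifier work may already be queued: one extra reference is taken per successful schedule_work() and dropped by the work function. For context, a hedged sketch of the consumer side this helper serves, using the long-standing irq_set_affinity_notifier() API; the my_* names are invented.

static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	/* Runs from the scheduled work with the new affinity mask */
	pr_info("irq %u affinity changed to %*pbl\n", notify->irq,
		cpumask_pr_args(mask));
}

static void my_affinity_release(struct kref *ref)
{
	struct irq_affinity_notify *notify =
		container_of(ref, struct irq_affinity_notify, kref);

	kfree(notify);
}

static int my_register_notifier(unsigned int irq)
{
	struct irq_affinity_notify *notify = kzalloc(sizeof(*notify), GFP_KERNEL);

	if (!notify)
		return -ENOMEM;

	notify->notify = my_affinity_notify;
	notify->release = my_affinity_release;
	return irq_set_affinity_notifier(irq, notify);
}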
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -367,14 +393,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
irq_copy_pending(desc, mask);
}
- if (desc->affinity_notify) {
- kref_get(&desc->affinity_notify->kref);
- if (!schedule_work(&desc->affinity_notify->work)) {
- /* Work was already scheduled, drop our extra ref */
- kref_put(&desc->affinity_notify->kref,
- desc->affinity_notify->release);
- }
- }
+ if (desc->affinity_notify)
+ irq_affinity_schedule_notify_work(desc);
+
irqd_set(data, IRQD_AFFINITY_SET);
return ret;
@@ -1474,6 +1495,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags |= irqd_get_trigger_type(&desc->irq_data);
/*
+ * IRQF_ONESHOT means the interrupt source in the IRQ chip will be
+	 * masked until the threaded handler is done. If there is no threaded
+	 * handler, it makes no sense to have IRQF_ONESHOT.
+ */
+ WARN_ON_ONCE(new->flags & IRQF_ONESHOT && !new->thread_fn);
+
+ /*
* Check whether the interrupt nests into another interrupt
* thread.
*/
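A minimal sketch of the combination the new WARN_ON_ONCE() checks for (editorial, assuming a hypothetical device): IRQF_ONESHOT only makes sense when a threaded handler exists to eventually unmask the line.

static irqreturn_t my_hardirq(int irq, void *dev_id)
{
	/* Quick check in hard IRQ context, defer the real work */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* Sleepable handling; the line stays masked until we return */
	return IRQ_HANDLED;
}

static int my_request_irq(unsigned int irq, void *my_dev)
{
	/* A NULL hard IRQ handler would also be valid with IRQF_ONESHOT */
	return request_threaded_irq(irq, my_hardirq, my_thread_fn,
				    IRQF_ONESHOT, "my-dev", my_dev);
}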
@@ -1778,8 +1806,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
chip_bus_sync_unlock(desc);
mutex_unlock(&desc->request_mutex);
- irq_setup_timings(desc, new);
-
wake_up_and_wait_for_irq_thread_ready(desc, new);
wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
@@ -1950,7 +1976,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
- irq_remove_timings(desc);
}
mutex_unlock(&desc->request_mutex);
@@ -2451,36 +2476,6 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
kfree(__free_percpu_irq(irq, dev_id));
}
-/**
- * setup_percpu_irq - setup a per-cpu interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
- *
- * Used to statically setup per-cpu interrupts in the early boot process.
- */
-int setup_percpu_irq(unsigned int irq, struct irqaction *act)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- int retval;
-
- if (!desc || !irq_settings_is_per_cpu_devid(desc))
- return -EINVAL;
-
- retval = irq_chip_pm_get(&desc->irq_data);
- if (retval < 0)
- return retval;
-
- if (!act->affinity)
- act->affinity = cpu_online_mask;
-
- retval = __setup_irq(irq, desc, act);
-
- if (retval)
- irq_chip_pm_put(&desc->irq_data);
-
- return retval;
-}
-
static
struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags,
const char *devname, const cpumask_t *affinity,
@@ -2513,10 +2508,9 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
}
/**
- * __request_percpu_irq - allocate a percpu interrupt line
+ * request_percpu_irq_affinity - allocate a percpu interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
- * @flags: Interrupt type flags (IRQF_TIMER only)
* @devname: An ascii name for the claiming device
* @affinity: A cpumask describing the target CPUs for this interrupt
* @dev_id: A percpu cookie passed back to the handler function
@@ -2529,9 +2523,8 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
* the handler gets called with the interrupted CPU's instance of
* that variable.
*/
-int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
- unsigned long flags, const char *devname,
- const cpumask_t *affinity, void __percpu *dev_id)
+int request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, const char *devname,
+ const cpumask_t *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -2545,10 +2538,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
!irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
- if (flags && flags != IRQF_TIMER)
- return -EINVAL;
-
- action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id);
+ action = create_percpu_irqaction(handler, 0, devname, affinity, dev_id);
if (!action)
return -ENOMEM;
@@ -2567,7 +2557,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
-EXPORT_SYMBOL_GPL(__request_percpu_irq);
+EXPORT_SYMBOL_GPL(request_percpu_irq_affinity);
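A hedged caller sketch for the renamed request_percpu_irq_affinity(), using the signature introduced above. The per-CPU cookie type and names are invented; as the kernel-doc describes, the handler receives the interrupted CPU's instance of the cookie.

struct my_pcpu_state {
	u64 count;
};

static irqreturn_t my_percpu_handler(int irq, void *dev_id)
{
	struct my_pcpu_state *st = dev_id;	/* this CPU's instance */

	st->count++;
	return IRQ_HANDLED;
}

static int my_request(unsigned int irq, const struct cpumask *affinity)
{
	struct my_pcpu_state __percpu *state = alloc_percpu(struct my_pcpu_state);

	if (!state)
		return -ENOMEM;

	return request_percpu_irq_affinity(irq, my_percpu_handler, "my-pcpu",
					   affinity, state);
}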
/**
* request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 77258eafbf63..b0999a4f1f68 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
+#include <linux/string.h>
#include "internals.h"
@@ -317,7 +318,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
if (!desc->dir || action->dir || !action->name || !name_unique(irq, action))
return;
- snprintf(name, MAX_NAMELEN, "%s", action->name);
+ strscpy(name, action->name);
/* create /proc/irq/1234/handler/ */
action->dir = proc_mkdir(name, desc->dir);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
deleted file mode 100644
index 4b7315e99bd6..000000000000
--- a/kernel/irq/timings.c
+++ /dev/null
@@ -1,959 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
-#define pr_fmt(fmt) "irq_timings: " fmt
-
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/static_key.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/idr.h>
-#include <linux/irq.h>
-#include <linux/math64.h>
-#include <linux/log2.h>
-
-#include <trace/events/irq.h>
-
-#include "internals.h"
-
-DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
-
-DEFINE_PER_CPU(struct irq_timings, irq_timings);
-
-static DEFINE_IDR(irqt_stats);
-
-void irq_timings_enable(void)
-{
- static_branch_enable(&irq_timing_enabled);
-}
-
-void irq_timings_disable(void)
-{
- static_branch_disable(&irq_timing_enabled);
-}
-
-/*
- * The main goal of this algorithm is to predict the next interrupt
- * occurrence on the current CPU.
- *
- * Currently, the interrupt timings are stored in a circular array
- * buffer every time there is an interrupt, as a tuple: the interrupt
- * number and the associated timestamp when the event occurred <irq,
- * timestamp>.
- *
- * For every interrupt occurring in a short period of time, we can
- * measure the elapsed time between the occurrences for the same
- * interrupt and we end up with a suite of intervals. The experience
- * showed the interrupts are often coming following a periodic
- * pattern.
- *
- * The objective of the algorithm is to find out this periodic pattern
- * in a fastest way and use its period to predict the next irq event.
- *
- * When the next interrupt event is requested, we are in the situation
- * where the interrupts are disabled and the circular buffer
- * containing the timings is filled with the events which happened
- * after the previous next-interrupt-event request.
- *
- * At this point, we read the circular buffer and we fill the irq
- * related statistics structure. After this step, the circular array
- * containing the timings is empty because all the values are
- * dispatched in their corresponding buffers.
- *
- * Now for each interrupt, we can predict the next event by using the
- * suffix array, log interval and exponential moving average
- *
- * 1. Suffix array
- *
- * Suffix array is an array of all the suffixes of a string. It is
- * widely used as a data structure for compression, text search, ...
- * For instance for the word 'banana', the suffixes will be: 'banana'
- * 'anana' 'nana' 'ana' 'na' 'a'
- *
- * Usually, the suffix array is sorted but for our purpose it is
- * not necessary and won't provide any improvement in the context of
- * the solved problem where we clearly define the boundaries of the
- * search by a max period and min period.
- *
- * The suffix array will build a suite of intervals of different
- * length and will look for the repetition of each suite. If the suite
- * is repeating then we have the period because it is the length of
- * the suite whatever its position in the buffer.
- *
- * 2. Log interval
- *
- * We saw the irq timings allow to compute the interval of the
- * occurrences for a specific interrupt. We can reasonably assume the
- * longer is the interval, the higher is the error for the next event
- * and we can consider storing those interval values into an array
- * where each slot in the array correspond to an interval at the power
- * of 2 of the index. For example, index 12 will contain values
- * between 2^11 and 2^12.
- *
- * At the end we have an array of values where at each index defines a
- * [2^index - 1, 2 ^ index] interval values allowing to store a large
- * number of values inside a small array.
- *
- * For example, if we have the value 1123, then we store it at
- * ilog2(1123) = 10 index value.
- *
- * Storing those value at the specific index is done by computing an
- * exponential moving average for this specific slot. For instance,
- * for values 1800, 1123, 1453, ... fall under the same slot (10) and
- * the exponential moving average is computed every time a new value
- * is stored at this slot.
- *
- * 3. Exponential Moving Average
- *
- * The EMA is largely used to track a signal for stocks or as a low
- * pass filter. The magic of the formula, is it is very simple and the
- * reactivity of the average can be tuned with the factors called
- * alpha.
- *
- * The higher the alphas are, the faster the average respond to the
- * signal change. In our case, if a slot in the array is a big
- * interval, we can have numbers with a big difference between
- * them. The impact of those differences in the average computation
- * can be tuned by changing the alpha value.
- *
- *
- * -- The algorithm --
- *
- * We saw the different processing above, now let's see how they are
- * used together.
- *
- * For each interrupt:
- * For each interval:
- * Compute the index = ilog2(interval)
- * Compute a new_ema(buffer[index], interval)
- * Store the index in a circular buffer
- *
- * Compute the suffix array of the indexes
- *
- * For each suffix:
- * If the suffix is reverse-found 3 times
- * Return suffix
- *
- * Return Not found
- *
- * However we can not have endless suffix array to be build, it won't
- * make sense and it will add an extra overhead, so we can restrict
- * this to a maximum suffix length of 5 and a minimum suffix length of
- * 2. The experience showed 5 is the majority of the maximum pattern
- * period found for different devices.
- *
- * The result is a pattern finding less than 1us for an interrupt.
- *
- * Example based on real values:
- *
- * Example 1 : MMC write/read interrupt interval:
- *
- * 223947, 1240, 1384, 1386, 1386,
- * 217416, 1236, 1384, 1386, 1387,
- * 214719, 1241, 1386, 1387, 1384,
- * 213696, 1234, 1384, 1386, 1388,
- * 219904, 1240, 1385, 1389, 1385,
- * 212240, 1240, 1386, 1386, 1386,
- * 214415, 1236, 1384, 1386, 1387,
- * 214276, 1234, 1384, 1388, ?
- *
- * For each element, apply ilog2(value)
- *
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, ?
- *
- * Max period of 5, we take the last (max_period * 3) 15 elements as
- * we can be confident if the pattern repeats itself three times it is
- * a repeating pattern.
- *
- * 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, 8,
- * 15, 8, 8, 8, ?
- *
- * Suffixes are:
- *
- * 1) 8, 15, 8, 8, 8 <- max period
- * 2) 8, 15, 8, 8
- * 3) 8, 15, 8
- * 4) 8, 15 <- min period
- *
- * From there we search the repeating pattern for each suffix.
- *
- * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
- * | | | | | | | | | | | | | | |
- * 8, 15, 8, 8, 8 | | | | | | | | | |
- * 8, 15, 8, 8, 8 | | | | |
- * 8, 15, 8, 8, 8
- *
- * When moving the suffix, we found exactly 3 matches.
- *
- * The first suffix with period 5 is repeating.
- *
- * The next event is (3 * max_period) % suffix_period
- *
- * In this example, the result 0, so the next event is suffix[0] => 8
- *
- * However, 8 is the index in the array of exponential moving average
- * which was calculated on the fly when storing the values, so the
- * interval is ema[8] = 1366
- *
- *
- * Example 2:
- *
- * 4, 3, 5, 100,
- * 3, 3, 5, 117,
- * 4, 4, 5, 112,
- * 4, 3, 4, 110,
- * 3, 5, 3, 117,
- * 4, 4, 5, 112,
- * 4, 3, 4, 110,
- * 3, 4, 5, 112,
- * 4, 3, 4, 110
- *
- * ilog2
- *
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4
- *
- * Max period 5:
- * 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4,
- * 0, 0, 0, 4
- *
- * Suffixes:
- *
- * 1) 0, 0, 4, 0, 0
- * 2) 0, 0, 4, 0
- * 3) 0, 0, 4
- * 4) 0, 0
- *
- * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
- * | | | | | | X
- * 0, 0, 4, 0, 0, | X
- * 0, 0
- *
- * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
- * | | | | | | | | | | | | | | |
- * 0, 0, 4, 0, | | | | | | | | | | |
- * 0, 0, 4, 0, | | | | | | |
- * 0, 0, 4, 0, | | |
- * 0 0 4
- *
- * Pattern is found 3 times, the remaining is 1 which results from
- * (max_period * 3) % suffix_period. This value is the index in the
- * suffix arrays. The suffix array for a period 4 has the value 4
- * at index 1.
- */
-#define EMA_ALPHA_VAL 64
-#define EMA_ALPHA_SHIFT 7
-
-#define PREDICTION_PERIOD_MIN 3
-#define PREDICTION_PERIOD_MAX 5
-#define PREDICTION_FACTOR 4
-#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
-#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
-
-/*
- * Number of elements in the circular buffer: If it happens it was
- * flushed before, then the number of elements could be smaller than
- * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
- * used as we wrapped. The index begins from zero when we did not
- * wrap. That could be done in a nicer way with the proper circular
- * array structure type but with the cost of extra computation in the
- * interrupt handler hot path. We choose efficiency.
- */
-#define for_each_irqts(i, irqts) \
- for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
- 0 : irqts->count & IRQ_TIMINGS_MASK, \
- irqts->count = min(IRQ_TIMINGS_SIZE, \
- irqts->count); \
- irqts->count > 0; irqts->count--, \
- i = (i + 1) & IRQ_TIMINGS_MASK)
-
-struct irqt_stat {
- u64 last_ts;
- u64 ema_time[PREDICTION_BUFFER_SIZE];
- int timings[IRQ_TIMINGS_SIZE];
- int circ_timings[IRQ_TIMINGS_SIZE];
- int count;
-};
-
-/*
- * Exponential moving average computation
- */
-static u64 irq_timings_ema_new(u64 value, u64 ema_old)
-{
- s64 diff;
-
- if (unlikely(!ema_old))
- return value;
-
- diff = (value - ema_old) * EMA_ALPHA_VAL;
- /*
- * We can use a s64 type variable to be added with the u64
- * ema_old variable as this one will never have its topmost
- * bit set, it will be always smaller than 2^63 nanosec
- * interrupt interval (292 years).
- */
- return ema_old + (diff >> EMA_ALPHA_SHIFT);
-}
-
-static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
-{
- int period;
-
- /*
- * Move the beginning pointer to the end minus the max period x 3.
- * We are at the point we can begin searching the pattern
- */
- buffer = &buffer[len - (period_max * 3)];
-
- /* Adjust the length to the maximum allowed period x 3 */
- len = period_max * 3;
-
- /*
- * The buffer contains the suite of intervals, in a ilog2
- * basis, we are looking for a repetition. We point the
- * beginning of the search three times the length of the
- * period beginning at the end of the buffer. We do that for
- * each suffix.
- */
- for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
-
- /*
- * The first comparison always succeed because the
- * suffix is deduced from the first n-period bytes of
- * the buffer and we compare the initial suffix with
- * itself, so we can skip the first iteration.
- */
- int idx = period;
- size_t size = period;
-
- /*
- * We look if the suite with period 'i' repeat
- * itself. If it is truncated at the end, as it
- * repeats we can use the period to find out the next
- * element with the modulo.
- */
- while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
-
- /*
- * Move the index in a period basis
- */
- idx += size;
-
- /*
- * If this condition is reached, all previous
- * memcmp were successful, so the period is
- * found.
- */
- if (idx == len)
- return buffer[len % period];
-
- /*
- * If the remaining elements to compare are
- * smaller than the period, readjust the size
- * of the comparison for the last iteration.
- */
- if (len - idx < period)
- size = len - idx;
- }
- }
-
- return -1;
-}
-
-static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
-{
- int index, i, period_max, count, start, min = INT_MAX;
-
- if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
- irqs->count = irqs->last_ts = 0;
- return U64_MAX;
- }
-
- /*
- * As we want to find three times the repetition, we need a
- * number of intervals greater or equal to three times the
- * maximum period, otherwise we truncate the max period.
- */
- period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
- PREDICTION_PERIOD_MAX : irqs->count / 3;
-
- /*
- * If we don't have enough irq timings for this prediction,
- * just bail out.
- */
- if (period_max <= PREDICTION_PERIOD_MIN)
- return U64_MAX;
-
- /*
- * 'count' will depends if the circular buffer wrapped or not
- */
- count = irqs->count < IRQ_TIMINGS_SIZE ?
- irqs->count : IRQ_TIMINGS_SIZE;
-
- start = irqs->count < IRQ_TIMINGS_SIZE ?
- 0 : (irqs->count & IRQ_TIMINGS_MASK);
-
- /*
- * Copy the content of the circular buffer into another buffer
- * in order to linearize the buffer instead of dealing with
- * wrapping indexes and shifted array which will be prone to
- * error and extremely difficult to debug.
- */
- for (i = 0; i < count; i++) {
- int index = (start + i) & IRQ_TIMINGS_MASK;
-
- irqs->timings[i] = irqs->circ_timings[index];
- min = min_t(int, irqs->timings[i], min);
- }
-
- index = irq_timings_next_event_index(irqs->timings, count, period_max);
- if (index < 0)
- return irqs->last_ts + irqs->ema_time[min];
-
- return irqs->last_ts + irqs->ema_time[index];
-}
-
-static __always_inline int irq_timings_interval_index(u64 interval)
-{
- /*
- * The PREDICTION_FACTOR increase the interval size for the
- * array of exponential average.
- */
- u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
-
- return likely(interval_us) ? ilog2(interval_us) : 0;
-}
-
-static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
- u64 interval)
-{
- int index;
-
- /*
- * Get the index in the ema table for this interrupt.
- */
- index = irq_timings_interval_index(interval);
-
- if (index > PREDICTION_BUFFER_SIZE - 1) {
- irqs->count = 0;
- return;
- }
-
- /*
- * Store the index as an element of the pattern in another
- * circular array.
- */
- irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
-
- irqs->ema_time[index] = irq_timings_ema_new(interval,
- irqs->ema_time[index]);
-
- irqs->count++;
-}
-
-static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
-{
- u64 old_ts = irqs->last_ts;
- u64 interval;
-
- /*
- * The timestamps are absolute time values, we need to compute
- * the timing interval between two interrupts.
- */
- irqs->last_ts = ts;
-
- /*
- * The interval type is u64 in order to deal with the same
- * type in our computation, that prevent mindfuck issues with
- * overflow, sign and division.
- */
- interval = ts - old_ts;
-
- /*
- * The interrupt triggered more than one second apart, that
- * ends the sequence as predictable for our purpose. In this
- * case, assume we have the beginning of a sequence and the
- * timestamp is the first value. As it is impossible to
- * predict anything at this point, return.
- *
- * Note the first timestamp of the sequence will always fall
- * in this test because the old_ts is zero. That is what we
- * want as we need another timestamp to compute an interval.
- */
- if (interval >= NSEC_PER_SEC) {
- irqs->count = 0;
- return;
- }
-
- __irq_timings_store(irq, irqs, interval);
-}
-
-/**
- * irq_timings_next_event - Return when the next event is supposed to arrive
- * @now: current time
- *
- * During the last busy cycle, the number of interrupts is incremented
- * and stored in the irq_timings structure. This information is
- * necessary to:
- *
- * - know if the index in the table wrapped up:
- *
- * If more than the array size interrupts happened during the
- * last busy/idle cycle, the index wrapped up and we have to
- * begin with the next element in the array which is the last one
- * in the sequence, otherwise it is at the index 0.
- *
- * - have an indication of the interrupts activity on this CPU
- * (eg. irq/sec)
- *
- * The values are 'consumed' after inserting in the statistical model,
- * thus the count is reinitialized.
- *
- * The array of values **must** be browsed in the time direction, the
- * timestamp must increase between an element and the next one.
- *
- * Returns a nanosec time based estimation of the earliest interrupt,
- * U64_MAX otherwise.
- */
-u64 irq_timings_next_event(u64 now)
-{
- struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
- struct irqt_stat *irqs;
- struct irqt_stat __percpu *s;
- u64 ts, next_evt = U64_MAX;
- int i, irq = 0;
-
- /*
- * This function must be called with the local irq disabled in
- * order to prevent the timings circular buffer to be updated
- * while we are reading it.
- */
- lockdep_assert_irqs_disabled();
-
- if (!irqts->count)
- return next_evt;
-
- /*
- * Number of elements in the circular buffer: If it happens it
- * was flushed before, then the number of elements could be
- * smaller than IRQ_TIMINGS_SIZE, so the count is used,
- * otherwise the array size is used as we wrapped. The index
- * begins from zero when we did not wrap. That could be done
- * in a nicer way with the proper circular array structure
- * type but with the cost of extra computation in the
- * interrupt handler hot path. We choose efficiency.
- *
- * Inject measured irq/timestamp to the pattern prediction
- * model while decrementing the counter because we consume the
- * data from our circular buffer.
- */
- for_each_irqts(i, irqts) {
- irq = irq_timing_decode(irqts->values[i], &ts);
- s = idr_find(&irqt_stats, irq);
- if (s)
- irq_timings_store(irq, this_cpu_ptr(s), ts);
- }
-
- /*
- * Look in the list of interrupts' statistics, the earliest
- * next event.
- */
- idr_for_each_entry(&irqt_stats, s, i) {
-
- irqs = this_cpu_ptr(s);
-
- ts = __irq_timings_next_event(irqs, i, now);
- if (ts <= now)
- return now;
-
- if (ts < next_evt)
- next_evt = ts;
- }
-
- return next_evt;
-}
-
-void irq_timings_free(int irq)
-{
- struct irqt_stat __percpu *s;
-
- s = idr_find(&irqt_stats, irq);
- if (s) {
- free_percpu(s);
- idr_remove(&irqt_stats, irq);
- }
-}
-
-int irq_timings_alloc(int irq)
-{
- struct irqt_stat __percpu *s;
- int id;
-
- /*
- * Some platforms can have the same private interrupt per cpu,
- * so this function may be called several times with the
- * same interrupt number. Just bail out in case the per cpu
- * stat structure is already allocated.
- */
- s = idr_find(&irqt_stats, irq);
- if (s)
- return 0;
-
- s = alloc_percpu(*s);
- if (!s)
- return -ENOMEM;
-
- idr_preload(GFP_KERNEL);
- id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
- idr_preload_end();
-
- if (id < 0) {
- free_percpu(s);
- return id;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_TEST_IRQ_TIMINGS
-struct timings_intervals {
- u64 *intervals;
- size_t count;
-};
-
-/*
- * Intervals are given in nanosecond base
- */
-static u64 intervals0[] __initdata = {
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000, 500000,
- 10000, 50000, 200000,
-};
-
-static u64 intervals1[] __initdata = {
- 223947000, 1240000, 1384000, 1386000, 1386000,
- 217416000, 1236000, 1384000, 1386000, 1387000,
- 214719000, 1241000, 1386000, 1387000, 1384000,
- 213696000, 1234000, 1384000, 1386000, 1388000,
- 219904000, 1240000, 1385000, 1389000, 1385000,
- 212240000, 1240000, 1386000, 1386000, 1386000,
- 214415000, 1236000, 1384000, 1386000, 1387000,
- 214276000, 1234000,
-};
-
-static u64 intervals2[] __initdata = {
- 4000, 3000, 5000, 100000,
- 3000, 3000, 5000, 117000,
- 4000, 4000, 5000, 112000,
- 4000, 3000, 4000, 110000,
- 3000, 5000, 3000, 117000,
- 4000, 4000, 5000, 112000,
- 4000, 3000, 4000, 110000,
- 3000, 4000, 5000, 112000,
- 4000,
-};
-
-static u64 intervals3[] __initdata = {
- 1385000, 212240000, 1240000,
- 1386000, 214415000, 1236000,
- 1384000, 214276000, 1234000,
- 1386000, 214415000, 1236000,
- 1385000, 212240000, 1240000,
- 1386000, 214415000, 1236000,
- 1384000, 214276000, 1234000,
- 1386000, 214415000, 1236000,
- 1385000, 212240000, 1240000,
-};
-
-static u64 intervals4[] __initdata = {
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000, 50000, 10000, 50000,
- 10000,
-};
-
-static struct timings_intervals tis[] __initdata = {
- { intervals0, ARRAY_SIZE(intervals0) },
- { intervals1, ARRAY_SIZE(intervals1) },
- { intervals2, ARRAY_SIZE(intervals2) },
- { intervals3, ARRAY_SIZE(intervals3) },
- { intervals4, ARRAY_SIZE(intervals4) },
-};
-
-static int __init irq_timings_test_next_index(struct timings_intervals *ti)
-{
- int _buffer[IRQ_TIMINGS_SIZE];
- int buffer[IRQ_TIMINGS_SIZE];
- int index, start, i, count, period_max;
-
- count = ti->count - 1;
-
- period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
- PREDICTION_PERIOD_MAX : count / 3;
-
- /*
- * Inject all values except the last one which will be used
- * to compare with the next index result.
- */
- pr_debug("index suite: ");
-
- for (i = 0; i < count; i++) {
- index = irq_timings_interval_index(ti->intervals[i]);
- _buffer[i & IRQ_TIMINGS_MASK] = index;
- pr_cont("%d ", index);
- }
-
- start = count < IRQ_TIMINGS_SIZE ? 0 :
- count & IRQ_TIMINGS_MASK;
-
- count = min_t(int, count, IRQ_TIMINGS_SIZE);
-
- for (i = 0; i < count; i++) {
- int index = (start + i) & IRQ_TIMINGS_MASK;
- buffer[i] = _buffer[index];
- }
-
- index = irq_timings_next_event_index(buffer, count, period_max);
- i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
-
- if (index != i) {
- pr_err("Expected (%d) and computed (%d) next indexes differ\n",
- i, index);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int __init irq_timings_next_index_selftest(void)
-{
- int i, ret;
-
- for (i = 0; i < ARRAY_SIZE(tis); i++) {
-
- pr_info("---> Injecting intervals number #%d (count=%zd)\n",
- i, tis[i].count);
-
- ret = irq_timings_test_next_index(&tis[i]);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static int __init irq_timings_test_irqs(struct timings_intervals *ti)
-{
- struct irqt_stat __percpu *s;
- struct irqt_stat *irqs;
- int i, index, ret, irq = 0xACE5;
-
- ret = irq_timings_alloc(irq);
- if (ret) {
- pr_err("Failed to allocate irq timings\n");
- return ret;
- }
-
- s = idr_find(&irqt_stats, irq);
- if (!s) {
- ret = -EIDRM;
- goto out;
- }
-
- irqs = this_cpu_ptr(s);
-
- for (i = 0; i < ti->count; i++) {
-
- index = irq_timings_interval_index(ti->intervals[i]);
- pr_debug("%d: interval=%llu ema_index=%d\n",
- i, ti->intervals[i], index);
-
- __irq_timings_store(irq, irqs, ti->intervals[i]);
- if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
- ret = -EBADSLT;
- pr_err("Failed to store in the circular buffer\n");
- goto out;
- }
- }
-
- if (irqs->count != ti->count) {
- ret = -ERANGE;
- pr_err("Count differs\n");
- goto out;
- }
-
- ret = 0;
-out:
- irq_timings_free(irq);
-
- return ret;
-}
-
-static int __init irq_timings_irqs_selftest(void)
-{
- int i, ret;
-
- for (i = 0; i < ARRAY_SIZE(tis); i++) {
- pr_info("---> Injecting intervals number #%d (count=%zd)\n",
- i, tis[i].count);
- ret = irq_timings_test_irqs(&tis[i]);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static int __init irq_timings_test_irqts(struct irq_timings *irqts,
- unsigned count)
-{
- int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
- int i, irq, oirq = 0xBEEF;
- u64 ots = 0xDEAD, ts;
-
- /*
- * Fill the circular buffer by using the dedicated function.
- */
- for (i = 0; i < count; i++) {
- pr_debug("%d: index=%d, ts=%llX irq=%X\n",
- i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
-
- irq_timings_push(ots + i, oirq + i);
- }
-
- /*
- * Compute the first elements values after the index wrapped
- * up or not.
- */
- ots += start;
- oirq += start;
-
- /*
- * Test the circular buffer count is correct.
- */
- pr_debug("---> Checking timings array count (%d) is right\n", count);
- if (WARN_ON(irqts->count != count))
- return -EINVAL;
-
- /*
- * Test the macro allowing to browse all the irqts.
- */
- pr_debug("---> Checking the for_each_irqts() macro\n");
- for_each_irqts(i, irqts) {
-
- irq = irq_timing_decode(irqts->values[i], &ts);
-
- pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
- i, ts, ots, irq, oirq);
-
- if (WARN_ON(ts != ots || irq != oirq))
- return -EINVAL;
-
- ots++; oirq++;
- }
-
- /*
- * The circular buffer should have be flushed when browsed
- * with for_each_irqts
- */
- pr_debug("---> Checking timings array is empty after browsing it\n");
- if (WARN_ON(irqts->count))
- return -EINVAL;
-
- return 0;
-}
-
-static int __init irq_timings_irqts_selftest(void)
-{
- struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
- int i, ret;
-
- /*
- * Test the circular buffer with different number of
- * elements. The purpose is to test at the limits (empty, half
- * full, full, wrapped with the cursor at the boundaries,
- * wrapped several times, etc ...
- */
- int count[] = { 0,
- IRQ_TIMINGS_SIZE >> 1,
- IRQ_TIMINGS_SIZE,
- IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
- 2 * IRQ_TIMINGS_SIZE,
- (2 * IRQ_TIMINGS_SIZE) + 3,
- };
-
- for (i = 0; i < ARRAY_SIZE(count); i++) {
-
- pr_info("---> Checking the timings with %d/%d values\n",
- count[i], IRQ_TIMINGS_SIZE);
-
- ret = irq_timings_test_irqts(irqts, count[i]);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static int __init irq_timings_selftest(void)
-{
- int ret;
-
- pr_info("------------------- selftest start -----------------\n");
-
- /*
- * At this point, we don't except any subsystem to use the irq
- * timings but us, so it should not be enabled.
- */
- if (static_branch_unlikely(&irq_timing_enabled)) {
- pr_warn("irq timings already initialized, skipping selftest\n");
- return 0;
- }
-
- ret = irq_timings_irqts_selftest();
- if (ret)
- goto out;
-
- ret = irq_timings_irqs_selftest();
- if (ret)
- goto out;
-
- ret = irq_timings_next_index_selftest();
-out:
- pr_info("---------- selftest end with %s -----------\n",
- ret ? "failure" : "success");
-
- return ret;
-}
-early_initcall(irq_timings_selftest);
-#endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 049e296f586c..aec2f06858af 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos)
unsigned long kallsyms_sym_address(int idx)
{
- /* values are unsigned offsets */
- return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+ /* non-relocatable 32-bit kernels just embed the value directly */
+ if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE))
+ return (u32)kallsyms_offsets[idx];
+ return (unsigned long)offset_to_ptr(kallsyms_offsets + idx);
}
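For reference, the relative-reference scheme kallsyms_sym_address() now leans on can be pictured as below. This mirrors what offset_to_ptr() in include/linux/compiler.h does, reproduced here from memory as an illustration rather than the authoritative definition: each 32-bit entry stores a signed offset from its own address, which keeps the table position-independent without a separate kallsyms_relative_base.

/* Illustrative only: resolve a self-relative 32-bit reference */
static inline void *example_offset_to_ptr(const int *off)
{
	return (void *)((unsigned long)off + *off);
}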
static unsigned int get_symbol_seq(int index)
@@ -345,7 +347,7 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
return 1;
}
return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
- !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
+ !!bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
static int kallsyms_lookup_buildid(unsigned long addr,
@@ -355,8 +357,21 @@ static int kallsyms_lookup_buildid(unsigned long addr,
{
int ret;
- namebuf[KSYM_NAME_LEN - 1] = 0;
+ /*
+	 * kallsyms_lookup() returns a pointer to namebuf on success and
+	 * NULL on error. But some callers ignore the return value and
+	 * instead expect @namebuf to be filled with either a valid or
+	 * an empty string.
+ */
namebuf[0] = 0;
+ /*
+ * Initialize the module-related return values. They are not set
+	 * when the symbol is in vmlinux or is a BPF address.
+ */
+ if (modname)
+ *modname = NULL;
+ if (modbuildid)
+ *modbuildid = NULL;
if (is_ksym_addr(addr)) {
unsigned long pos;
@@ -365,10 +380,6 @@ static int kallsyms_lookup_buildid(unsigned long addr,
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
namebuf, KSYM_NAME_LEN);
- if (modname)
- *modname = NULL;
- if (modbuildid)
- *modbuildid = NULL;
return strlen(namebuf);
}
@@ -377,12 +388,11 @@ static int kallsyms_lookup_buildid(unsigned long addr,
ret = module_address_lookup(addr, symbolsize, offset,
modname, modbuildid, namebuf);
if (!ret)
- ret = bpf_address_lookup(addr, symbolsize,
- offset, modname, namebuf);
+ ret = bpf_address_lookup(addr, symbolsize, offset, namebuf);
if (!ret)
- ret = ftrace_mod_address_lookup(addr, symbolsize,
- offset, modname, namebuf);
+ ret = ftrace_mod_address_lookup(addr, symbolsize, offset,
+ modname, modbuildid, namebuf);
return ret;
}
@@ -426,6 +436,37 @@ int lookup_symbol_name(unsigned long addr, char *symname)
return lookup_module_symbol_name(addr, symname);
}
+#ifdef CONFIG_STACKTRACE_BUILD_ID
+
+static int append_buildid(char *buffer, const char *modname,
+ const unsigned char *buildid)
+{
+ if (!modname)
+ return 0;
+
+ if (!buildid) {
+ pr_warn_once("Undefined buildid for the module %s\n", modname);
+ return 0;
+ }
+
+ /* build ID should match length of sprintf */
+#ifdef CONFIG_MODULES
+ static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+
+ return sprintf(buffer, " %20phN", buildid);
+}
+
+#else /* CONFIG_STACKTRACE_BUILD_ID */
+
+static int append_buildid(char *buffer, const char *modname,
+ const unsigned char *buildid)
+{
+ return 0;
+}
+
+#endif /* CONFIG_STACKTRACE_BUILD_ID */
+
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
int symbol_offset, int add_offset, int add_buildid)
@@ -435,6 +476,9 @@ static int __sprint_symbol(char *buffer, unsigned long address,
unsigned long offset, size;
int len;
+ /* Prevent module removal until modname and modbuildid are printed */
+ guard(rcu)();
+
address += symbol_offset;
len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
buffer);
@@ -448,15 +492,8 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (modname) {
len += sprintf(buffer + len, " [%s", modname);
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- if (add_buildid && buildid) {
- /* build ID should match length of sprintf */
-#if IS_ENABLED(CONFIG_MODULES)
- static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
-#endif
- len += sprintf(buffer + len, " %20phN", buildid);
- }
-#endif
+ if (add_buildid)
+ len += append_buildid(buffer + len, modname, buildid);
len += sprintf(buffer + len, "]");
}
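A small hedged usage sketch of the path touched above: callers that want the build ID appended go through sprint_symbol_build_id() (an existing helper), and with CONFIG_STACKTRACE_BUILD_ID the module part now comes out via append_buildid(). The function below and the example output in the comment are illustrative only.

static void my_dump_return_address(unsigned long addr)
{
	char buf[KSYM_SYMBOL_LEN];

	/* e.g. "my_func+0x10/0x80 [my_module <20-byte build ID as hex>]" */
	sprint_symbol_build_id(buf, addr);
	pr_info("caller: %s\n", buf);
}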
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 9633782f8250..81a867dbe57d 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -8,7 +8,6 @@ extern const int kallsyms_offsets[];
extern const u8 kallsyms_names[];
extern const unsigned int kallsyms_num_syms;
-extern const unsigned long kallsyms_relative_base;
extern const char kallsyms_token_table[];
extern const u16 kallsyms_token_index[];
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6563141f5de9..5397d0c14127 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -55,13 +55,13 @@ struct kcov {
refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
- enum kcov_mode mode;
+ enum kcov_mode mode __guarded_by(&lock);
/* Size of arena (in long's). */
- unsigned int size;
+ unsigned int size __guarded_by(&lock);
/* Coverage buffer shared with user space. */
- void *area;
+ void *area __guarded_by(&lock);
/* Task for which we collect coverage, or NULL. */
- struct task_struct *t;
+ struct task_struct *t __guarded_by(&lock);
/* Collecting coverage from remote (background) threads. */
bool remote;
/* Size of remote area (in long's). */
@@ -391,6 +391,7 @@ void kcov_task_init(struct task_struct *t)
}
static void kcov_reset(struct kcov *kcov)
+ __must_hold(&kcov->lock)
{
kcov->t = NULL;
kcov->mode = KCOV_MODE_INIT;
@@ -400,6 +401,7 @@ static void kcov_reset(struct kcov *kcov)
}
static void kcov_remote_reset(struct kcov *kcov)
+ __must_hold(&kcov->lock)
{
int bkt;
struct kcov_remote *remote;
@@ -419,6 +421,7 @@ static void kcov_remote_reset(struct kcov *kcov)
}
static void kcov_disable(struct task_struct *t, struct kcov *kcov)
+ __must_hold(&kcov->lock)
{
kcov_task_reset(t);
if (kcov->remote)
@@ -435,8 +438,11 @@ static void kcov_get(struct kcov *kcov)
static void kcov_put(struct kcov *kcov)
{
if (refcount_dec_and_test(&kcov->refcount)) {
- kcov_remote_reset(kcov);
- vfree(kcov->area);
+ /* Context-safety: no references left, object being destroyed. */
+ context_unsafe(
+ kcov_remote_reset(kcov);
+ vfree(kcov->area);
+ );
kfree(kcov);
}
}
@@ -491,6 +497,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
unsigned long size, off;
struct page *page;
unsigned long flags;
+ void *area;
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
@@ -499,10 +506,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
res = -EINVAL;
goto exit;
}
+ area = kcov->area;
spin_unlock_irqrestore(&kcov->lock, flags);
vm_flags_set(vma, VM_DONTEXPAND);
for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
+ page = vmalloc_to_page(area + off);
res = vm_insert_page(vma, vma->vm_start + off, page);
if (res) {
pr_warn_once("kcov: vm_insert_page() failed\n");
@@ -522,10 +530,10 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ guard(spinlock_init)(&kcov->lock);
kcov->mode = KCOV_MODE_DISABLED;
kcov->sequence = 1;
refcount_set(&kcov->refcount, 1);
- spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
}
@@ -556,6 +564,7 @@ static int kcov_get_mode(unsigned long arg)
* vmalloc fault handling path is instrumented.
*/
static void kcov_fault_in_area(struct kcov *kcov)
+ __must_hold(&kcov->lock)
{
unsigned long stride = PAGE_SIZE / sizeof(unsigned long);
unsigned long *area = kcov->area;
@@ -584,6 +593,7 @@ static inline bool kcov_check_handle(u64 handle, bool common_valid,
static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
+ __must_hold(&kcov->lock)
{
struct task_struct *t;
unsigned long flags, unused;
@@ -814,6 +824,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
}
static void kcov_remote_softirq_start(struct task_struct *t)
+ __must_hold(&kcov_percpu_data.lock)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -831,6 +842,7 @@ static void kcov_remote_softirq_start(struct task_struct *t)
}
static void kcov_remote_softirq_stop(struct task_struct *t)
+ __must_hold(&kcov_percpu_data.lock)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
@@ -896,10 +908,12 @@ void kcov_remote_start(u64 handle)
/* Put in kcov_remote_stop(). */
kcov_get(kcov);
/*
- * Read kcov fields before unlock to prevent races with
- * KCOV_DISABLE / kcov_remote_reset().
+ * Read kcov fields before unlocking kcov_remote_lock to prevent races
+ * with KCOV_DISABLE and kcov_remote_reset(); cannot acquire kcov->lock
+ * here, because it might lead to deadlock given kcov_remote_lock is
+ * acquired _after_ kcov->lock elsewhere.
*/
- mode = kcov->mode;
+ mode = context_unsafe(kcov->mode);
sequence = kcov->sequence;
if (in_task()) {
size = kcov->remote_size;
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index a45f3dfc8d14..824f30c93252 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -1,4 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
+CONTEXT_ANALYSIS := y
+
KCSAN_SANITIZE := n
KCOV_INSTRUMENT := n
UBSAN_SANITIZE := n
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 219d22857c98..8ef8167be745 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -176,7 +176,7 @@ static bool __report_matches(const struct expect_report *r)
/* Title */
cur = expect[0];
- end = &expect[0][sizeof(expect[0]) - 1];
+ end = ARRAY_END(expect[0]);
cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
is_assert ? "assert: race" : "data-race");
if (r->access[1].fn) {
@@ -200,7 +200,7 @@ static bool __report_matches(const struct expect_report *r)
/* Access 1 */
cur = expect[1];
- end = &expect[1][sizeof(expect[1]) - 1];
+ end = ARRAY_END(expect[1]);
if (!r->access[1].fn)
cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index e95ce7d7a76e..11a48b78f8d1 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -116,6 +116,7 @@ static DEFINE_RAW_SPINLOCK(report_lock);
* been reported since (now - KCSAN_REPORT_ONCE_IN_MS).
*/
static bool rate_limit_report(unsigned long frame1, unsigned long frame2)
+ __must_hold(&report_lock)
{
struct report_time *use_entry = &report_times[0];
unsigned long invalid_before;
@@ -366,6 +367,7 @@ static int sym_strcmp(void *addr1, void *addr2)
static void
print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+ __must_hold(&report_lock)
{
stack_trace_print(stack_entries, num_entries, 0);
if (reordered_to)
@@ -373,6 +375,7 @@ print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long
}
static void print_verbose_info(struct task_struct *task)
+ __must_hold(&report_lock)
{
if (!task)
return;
@@ -389,6 +392,7 @@ static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
struct other_info *other_info,
u64 old, u64 new, u64 mask)
+ __must_hold(&report_lock)
{
unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
@@ -496,6 +500,7 @@ static void print_report(enum kcsan_value_change value_change,
}
static void release_report(unsigned long *flags, struct other_info *other_info)
+ __releases(&report_lock)
{
/*
* Use size to denote valid/invalid, since KCSAN entirely ignores
@@ -507,13 +512,11 @@ static void release_report(unsigned long *flags, struct other_info *other_info)
/*
* Sets @other_info->task and awaits consumption of @other_info.
- *
- * Precondition: report_lock is held.
- * Postcondition: report_lock is held.
*/
static void set_other_info_task_blocking(unsigned long *flags,
const struct access_info *ai,
struct other_info *other_info)
+ __must_hold(&report_lock)
{
/*
* We may be instrumenting a code-path where current->state is already
@@ -572,6 +575,7 @@ static void set_other_info_task_blocking(unsigned long *flags,
static void prepare_report_producer(unsigned long *flags,
const struct access_info *ai,
struct other_info *other_info)
+ __must_not_hold(&report_lock)
{
raw_spin_lock_irqsave(&report_lock, *flags);
@@ -603,6 +607,7 @@ static void prepare_report_producer(unsigned long *flags,
static bool prepare_report_consumer(unsigned long *flags,
const struct access_info *ai,
struct other_info *other_info)
+ __cond_acquires(true, &report_lock)
{
raw_spin_lock_irqsave(&report_lock, *flags);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index eb62a9794242..2bfbb2d144e6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -883,6 +883,60 @@ out_free_sha_regions:
#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
+ const char *strtab;
+ int i, k;
+
+ if (!pi->ehdr)
+ return NULL;
+
+ ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
* @kbuf: Buffer to setup.
@@ -960,6 +1014,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
unsigned long offset;
size_t sechdrs_size;
Elf_Shdr *sechdrs;
+ const Elf_Sym *entry_sym;
+ u16 entry_shndx = 0;
+ unsigned long entry_off = 0;
+ bool start_fixed = false;
int i;
/*
@@ -977,6 +1035,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
bss_addr = kbuf->mem + kbuf->bufsz;
kbuf->image->start = pi->ehdr->e_entry;
+ entry_sym = kexec_purgatory_find_symbol(pi, "purgatory_start");
+ if (entry_sym) {
+ entry_shndx = entry_sym->st_shndx;
+ entry_off = entry_sym->st_value;
+ }
+
for (i = 0; i < pi->ehdr->e_shnum; i++) {
unsigned long align;
void *src, *dst;
@@ -994,6 +1058,13 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
offset = ALIGN(offset, align);
+ if (!start_fixed && entry_sym && i == entry_shndx &&
+ (sechdrs[i].sh_flags & SHF_EXECINSTR) &&
+ entry_off < sechdrs[i].sh_size) {
+ kbuf->image->start = kbuf->mem + offset + entry_off;
+ start_fixed = true;
+ }
+
/*
* Check if the segment contains the entry point, if so,
* calculate the value of image->start based on it.
@@ -1004,13 +1075,14 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* is not set to the initial value, and warn the user so they
* have a chance to fix their purgatory's linker script.
*/
- if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ if (!start_fixed && sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ sechdrs[i].sh_size) &&
- !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
+ kbuf->image->start == pi->ehdr->e_entry) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
+ start_fixed = true;
}
src = (void *)pi->ehdr + sechdrs[i].sh_offset;
@@ -1128,61 +1200,6 @@ out_free_kbuf:
return ret;
}
-/*
- * kexec_purgatory_find_symbol - find a symbol in the purgatory
- * @pi: Purgatory to search in.
- * @name: Name of the symbol.
- *
- * Return: pointer to symbol in read-only symtab on success, NULL on error.
- */
-static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- const Elf_Shdr *sechdrs;
- const Elf_Ehdr *ehdr;
- const Elf_Sym *syms;
- const char *strtab;
- int i, k;
-
- if (!pi->ehdr)
- return NULL;
-
- ehdr = pi->ehdr;
- sechdrs = (void *)ehdr + ehdr->e_shoff;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (void *)ehdr + sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
{
struct purgatory_info *pi = &image->purgatory_info;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 39511dd2abc9..c9507689e181 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,8 +35,8 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
-static LIST_HEAD(kthreads_hotplug);
-static DEFINE_MUTEX(kthreads_hotplug_lock);
+static LIST_HEAD(kthread_affinity_list);
+static DEFINE_MUTEX(kthread_affinity_lock);
struct kthread_create_info
{
@@ -69,7 +69,7 @@ struct kthread {
/* To store the full name if task comm is truncated. */
char *full_name;
struct task_struct *task;
- struct list_head hotplug_node;
+ struct list_head affinity_node;
struct cpumask *preferred_affinity;
};
@@ -128,7 +128,7 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
- INIT_LIST_HEAD(&kthread->hotplug_node);
+ INIT_LIST_HEAD(&kthread->affinity_node);
p->vfork_done = &kthread->exited;
kthread->task = p;
@@ -323,10 +323,10 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
- if (!list_empty(&kthread->hotplug_node)) {
- mutex_lock(&kthreads_hotplug_lock);
- list_del(&kthread->hotplug_node);
- mutex_unlock(&kthreads_hotplug_lock);
+ if (!list_empty(&kthread->affinity_node)) {
+ mutex_lock(&kthread_affinity_lock);
+ list_del(&kthread->affinity_node);
+ mutex_unlock(&kthread_affinity_lock);
if (kthread->preferred_affinity) {
kfree(kthread->preferred_affinity);
@@ -362,17 +362,20 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
{
const struct cpumask *pref;
+ guard(rcu)();
+
if (kthread->preferred_affinity) {
pref = kthread->preferred_affinity;
} else {
- if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
- return;
- pref = cpumask_of_node(kthread->node);
+ if (kthread->node == NUMA_NO_NODE)
+ pref = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ else
+ pref = cpumask_of_node(kthread->node);
}
- cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (cpumask_empty(cpumask))
- cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
}
static void kthread_affine_node(void)
@@ -380,32 +383,29 @@ static void kthread_affine_node(void)
struct kthread *kthread = to_kthread(current);
cpumask_var_t affinity;
- WARN_ON_ONCE(kthread_is_per_cpu(current));
+ if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
+ return;
- if (kthread->node == NUMA_NO_NODE) {
- housekeeping_affine(current, HK_TYPE_KTHREAD);
- } else {
- if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
- WARN_ON_ONCE(1);
- return;
- }
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
- mutex_lock(&kthreads_hotplug_lock);
- WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
- list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
- /*
- * The node cpumask is racy when read from kthread() but:
- * - a racing CPU going down will either fail on the subsequent
- * call to set_cpus_allowed_ptr() or be migrated to housekeepers
- * afterwards by the scheduler.
- * - a racing CPU going up will be handled by kthreads_online_cpu()
- */
- kthread_fetch_affinity(kthread, affinity);
- set_cpus_allowed_ptr(current, affinity);
- mutex_unlock(&kthreads_hotplug_lock);
+ mutex_lock(&kthread_affinity_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
+ list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthread_affinity_lock);
- free_cpumask_var(affinity);
- }
+ free_cpumask_var(affinity);
}
static int kthread(void *_create)
@@ -453,6 +453,10 @@ static int kthread(void *_create)
self->started = 1;
+ /*
+	 * Apply the default node affinity if neither kthread_bind[_mask]() nor
+	 * kthread_affine_preferred() was called before the first wake-up.
+ */
if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
kthread_affine_node();
@@ -820,12 +824,13 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, comm);
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
cgroup_init_kthreadd();
+ kthread_affine_node();
+
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
@@ -851,6 +856,18 @@ int kthreadd(void *unused)
return 0;
}
+/**
+ * kthread_affine_preferred - Define a kthread's preferred affinity
+ * @p: thread created by kthread_create().
+ * @mask: preferred mask of CPUs (might not be online, must be possible) for @p
+ * to run on.
+ *
+ * Similar to kthread_bind_mask() except that the affinity is not a requirement
+ * but rather a preference that can be constrained by CPU isolation or CPU hotplug.
+ * Must be called before the first wakeup of the kthread.
+ *
+ * Returns 0 if the affinity has been applied.
+ */
int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
@@ -873,16 +890,16 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
goto out;
}
- mutex_lock(&kthreads_hotplug_lock);
+ mutex_lock(&kthread_affinity_lock);
cpumask_copy(kthread->preferred_affinity, mask);
- WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
- list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
+ list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
kthread_fetch_affinity(kthread, affinity);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
set_cpus_allowed_force(p, affinity);
- mutex_unlock(&kthreads_hotplug_lock);
+ mutex_unlock(&kthread_affinity_lock);
out:
free_cpumask_var(affinity);
@@ -890,22 +907,15 @@ out:
}
EXPORT_SYMBOL_GPL(kthread_affine_preferred);
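As a point of reference (not part of the patch), a minimal caller sketch for the API documented above: a driver creates its kthread, expresses a CPU preference before the first wake-up, then wakes the thread. The driver name, worker function and mask are hypothetical.

/* Hypothetical caller sketch -- not part of this patch. */
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static struct task_struct *example_start_worker(int (*fn)(void *), void *data,
						const struct cpumask *pref)
{
	struct task_struct *t;
	int err;

	t = kthread_create(fn, data, "example-worker");
	if (IS_ERR(t))
		return t;

	/* A preference only: CPU isolation or hotplug may still override it. */
	err = kthread_affine_preferred(t, pref);
	if (err)
		pr_warn("example: preferred affinity not applied: %d\n", err);

	/* Must happen after kthread_affine_preferred(), per the kernel-doc above. */
	wake_up_process(t);
	return t;
}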
-/*
- * Re-affine kthreads according to their preferences
- * and the newly online CPU. The CPU down part is handled
- * by select_fallback_rq() which default re-affines to
- * housekeepers from other nodes in case the preferred
- * affinity doesn't apply anymore.
- */
-static int kthreads_online_cpu(unsigned int cpu)
+static int kthreads_update_affinity(bool force)
{
cpumask_var_t affinity;
struct kthread *k;
int ret;
- guard(mutex)(&kthreads_hotplug_lock);
+ guard(mutex)(&kthread_affinity_lock);
- if (list_empty(&kthreads_hotplug))
+ if (list_empty(&kthread_affinity_list))
return 0;
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
@@ -913,14 +923,29 @@ static int kthreads_online_cpu(unsigned int cpu)
ret = 0;
- list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ list_for_each_entry(k, &kthread_affinity_list, affinity_node) {
if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
kthread_is_per_cpu(k->task))) {
ret = -EINVAL;
continue;
}
- kthread_fetch_affinity(k, affinity);
- set_cpus_allowed_ptr(k->task, affinity);
+
+ /*
+ * Unbound kthreads without preferred affinity are already affine
+ * to housekeeping, whether those CPUs are online or not. So no need
+ * to handle newly online CPUs for them. However housekeeping changes
+ * have to be applied.
+ *
+ * But kthreads with a preferred affinity or node are different:
+ * if none of their preferred CPUs are online and part of
+ * housekeeping at the same time, they must be affine to housekeeping.
+ * But as soon as one of their preferred CPUs becomes online, they must
+ * be affine to it.
+ */
+ if (force || k->preferred_affinity || k->node != NUMA_NO_NODE) {
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
}
free_cpumask_var(affinity);
@@ -928,6 +953,33 @@ static int kthreads_online_cpu(unsigned int cpu)
return ret;
}
+/**
+ * kthreads_update_housekeeping - Update kthreads affinity on cpuset change
+ *
+ * When cpuset changes a partition type to/from "isolated" or updates related
+ * cpumasks, propagate the housekeeping cpumask change to the affinity of
+ * the affected kthreads.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary cpumask couldn't
+ * be allocated, or -EINVAL in case of an internal error.
+ */
+int kthreads_update_housekeeping(void)
+{
+ return kthreads_update_affinity(true);
+}
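Illustration only: roughly how a caller that has just changed the HK_TYPE_DOMAIN housekeeping mask (e.g. cpuset flipping a partition to or from "isolated") might propagate that change. The surrounding function is hypothetical.

/* Hypothetical call site -- not part of this patch. */
static int example_apply_isolation_change(void)
{
	int err;

	/* ... the housekeeping (HK_TYPE_DOMAIN) cpumask was just updated ... */

	err = kthreads_update_housekeeping();
	if (err)
		pr_warn("kthread affinity update failed: %d\n", err);

	return err;
}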
+
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ return kthreads_update_affinity(false);
+}
+
static int kthreads_init(void)
{
return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9917756dae46..1acbad2dbfdf 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1356,6 +1356,25 @@ void klp_module_going(struct module *mod)
mutex_unlock(&klp_mutex);
}
+void *klp_find_section_by_name(const struct module *mod, const char *name,
+ size_t *sec_size)
+{
+ struct klp_modinfo *info = mod->klp_info;
+
+ for (int i = 1; i < info->hdr.e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+
+ if (!strcmp(info->secstrings + shdr->sh_name, name)) {
+ *sec_size = shdr->sh_size;
+ return (void *)shdr->sh_addr;
+ }
+ }
+
+ *sec_size = 0;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_find_section_by_name);
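A hedged usage sketch for the new export: a livepatch module looking up a data section in the module it patches. The section name and the surrounding function are invented for illustration.

/* Hypothetical livepatch-module sketch -- the section name is made up. */
#include <linux/errno.h>
#include <linux/livepatch.h>
#include <linux/module.h>
#include <linux/printk.h>

static int example_use_section(const struct module *patched_mod)
{
	size_t size;
	void *sec;

	sec = klp_find_section_by_name(patched_mod, ".example_state", &size);
	if (!sec)
		return -ENOENT;	/* section absent: size was set to 0 */

	pr_info("found .example_state: %zu bytes at %p\n", size, sec);
	return 0;
}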
+
static int __init klp_init(void)
{
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index d2aeaf13c3ac..1a8513f16ef7 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -54,7 +54,6 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT
config LIVEUPDATE
bool "Live Update Orchestrator"
depends on KEXEC_HANDOVER
- depends on SHMEM
help
Enable the Live Update Orchestrator. Live Update is a mechanism,
typically based on kexec, that allows the kernel to be updated
@@ -73,4 +72,20 @@ config LIVEUPDATE
If unsure, say N.
+config LIVEUPDATE_MEMFD
+ bool "Live update support for memfd"
+ depends on LIVEUPDATE
+ depends on MEMFD_CREATE
+ depends on SHMEM
+ default LIVEUPDATE
+ help
+ Enable live update support for memfd regions. This allows preserving
+ memfd-backed memory across kernel live updates.
+
+ This can be used to back VM memory with memfds, allowing the guest
+ memory to persist, or for other user workloads needing to preserve
+ pages.
+
+ If unsure, say N.
+
endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index 7cad2eece32d..d2f779cbe279 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -3,6 +3,7 @@
luo-y := \
luo_core.o \
luo_file.o \
+ luo_flb.o \
luo_session.o
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index d4482b6e3cae..fb3a7b67676e 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -15,6 +15,7 @@
#include <linux/count_zeros.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
@@ -24,7 +25,6 @@
#include <asm/early_ioremap.h>
-#include "kexec_handover_internal.h"
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
@@ -33,10 +33,7 @@
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"
-#define KHO_FDT_COMPATIBLE "kho-v1"
-#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
-#define PROP_SUB_FDT "fdt"
-
+/* The magic token for preserved pages */
#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
/*
@@ -219,10 +216,32 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
return 0;
}
+/* For physically contiguous 0-order pages. */
+static void kho_init_pages(struct page *page, unsigned long nr_pages)
+{
+ for (unsigned long i = 0; i < nr_pages; i++)
+ set_page_count(page + i, 1);
+}
+
+static void kho_init_folio(struct page *page, unsigned int order)
+{
+ unsigned long nr_pages = (1 << order);
+
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /* For higher order folios, tail pages get a page count of zero. */
+ for (unsigned long i = 1; i < nr_pages; i++)
+ set_page_count(page + i, 0);
+
+ if (order > 0)
+ prep_compound_page(page, order);
+}
+
static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
struct page *page = pfn_to_online_page(PHYS_PFN(phys));
- unsigned int nr_pages, ref_cnt;
+ unsigned long nr_pages;
union kho_page_info info;
if (!page)
@@ -240,20 +259,19 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
/* Clear private to make sure later restores on this page error out. */
page->private = 0;
- /* Head page gets refcount of 1. */
- set_page_count(page, 1);
-
- /*
- * For higher order folios, tail pages get a page count of zero.
- * For physically contiguous order-0 pages every pages gets a page
- * count of 1
- */
- ref_cnt = is_folio ? 0 : 1;
- for (unsigned int i = 1; i < nr_pages; i++)
- set_page_count(page + i, ref_cnt);
- if (is_folio && info.order)
- prep_compound_page(page, info.order);
+ if (is_folio)
+ kho_init_folio(page, info.order);
+ else
+ kho_init_pages(page, nr_pages);
+
+ /* Always mark headpage's codetag as empty to avoid accounting mismatch */
+ clear_page_tag_ref(page);
+ if (!is_folio) {
+ /* Also do that for the non-compound tail pages */
+ for (unsigned int i = 1; i < nr_pages; i++)
+ clear_page_tag_ref(page + i);
+ }
adjust_managed_page_count(page, nr_pages);
return page;
@@ -281,9 +299,9 @@ EXPORT_SYMBOL_GPL(kho_restore_folio);
* Restore a contiguous list of order 0 pages that was preserved with
* kho_preserve_pages().
*
- * Return: 0 on success, error code on failure
+ * Return: the first page on success, NULL on failure.
*/
-struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages)
{
const unsigned long start_pfn = PHYS_PFN(phys);
const unsigned long end_pfn = start_pfn + nr_pages;
@@ -378,7 +396,7 @@ static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
void *ptr;
u64 phys;
- ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+ ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL);
/* Check and discard previous memory map */
phys = get_unaligned((u64 *)ptr);
@@ -466,7 +484,7 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
const void *mem_ptr;
int len;
- mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
if (!mem_ptr || len != sizeof(u64)) {
pr_err("failed to get preserved memory bitmaps\n");
return 0;
@@ -637,11 +655,13 @@ static void __init kho_reserve_scratch(void)
scratch_size_update();
/* FIXME: deal with node hot-plug/remove */
- kho_scratch_cnt = num_online_nodes() + 2;
+ kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2;
size = kho_scratch_cnt * sizeof(*kho_scratch);
kho_scratch = memblock_alloc(size, PAGE_SIZE);
- if (!kho_scratch)
+ if (!kho_scratch) {
+ pr_err("Failed to reserve scratch array\n");
goto err_disable_kho;
+ }
/*
* reserve scratch area in low memory for lowmem allocations in the
@@ -650,8 +670,10 @@ static void __init kho_reserve_scratch(void)
size = scratch_size_lowmem;
addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
ARCH_LOW_ADDRESS_LIMIT);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve lowmem scratch buffer\n");
goto err_free_scratch_desc;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
@@ -660,20 +682,28 @@ static void __init kho_reserve_scratch(void)
/* reserve large contiguous area for allocations without nid */
size = scratch_size_global;
addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve global scratch buffer\n");
goto err_free_scratch_areas;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
i++;
- for_each_online_node(nid) {
+ /*
+ * Loop over nodes that both have memory and are online. Skip
+ * memoryless nodes, as we cannot allocate scratch areas there.
+ */
+ for_each_node_state(nid, N_MEMORY) {
size = scratch_size_node(nid);
addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
0, MEMBLOCK_ALLOC_ACCESSIBLE,
nid, true);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve nid %d scratch buffer\n", nid);
goto err_free_scratch_areas;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
@@ -727,7 +757,8 @@ int kho_add_subtree(const char *name, void *fdt)
goto out_pack;
}
- err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
+ &phys, sizeof(phys));
if (err < 0)
goto out_pack;
@@ -758,7 +789,7 @@ void kho_remove_subtree(void *fdt)
const u64 *val;
int len;
- val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(phys_addr_t))
continue;
@@ -823,7 +854,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
*
* Return: 0 on success, error code on failure
*/
-int kho_preserve_pages(struct page *page, unsigned int nr_pages)
+int kho_preserve_pages(struct page *page, unsigned long nr_pages)
{
struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
@@ -867,7 +898,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages);
* kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
* preserved blocks is not supported.
*/
-void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+void kho_unpreserve_pages(struct page *page, unsigned long nr_pages)
{
struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
@@ -877,21 +908,6 @@ void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
-struct kho_vmalloc_hdr {
- DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
-};
-
-#define KHO_VMALLOC_SIZE \
- ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
- sizeof(phys_addr_t))
-
-struct kho_vmalloc_chunk {
- struct kho_vmalloc_hdr hdr;
- phys_addr_t phys[KHO_VMALLOC_SIZE];
-};
-
-static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
-
/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
@@ -1006,8 +1022,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
chunk->phys[idx++] = phys;
if (idx == ARRAY_SIZE(chunk->phys)) {
chunk = new_vmalloc_chunk(chunk);
- if (!chunk)
+ if (!chunk) {
+ err = -ENOMEM;
goto err_free;
+ }
idx = 0;
}
}
@@ -1305,7 +1323,7 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
if (offset < 0)
return -ENOENT;
- val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(*val))
return -EINVAL;
@@ -1325,7 +1343,7 @@ static __init int kho_out_fdt_setup(void)
err |= fdt_finish_reservemap(root);
err |= fdt_begin_node(root, "");
err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
- err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
sizeof(empty_mem_map));
err |= fdt_end_node(root);
err |= fdt_finish(root);
@@ -1441,46 +1459,40 @@ void __init kho_memory_init(void)
void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
phys_addr_t scratch_phys, u64 scratch_len)
{
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
struct kho_scratch *scratch = NULL;
phys_addr_t mem_map_phys;
void *fdt = NULL;
- int err = 0;
- unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+ int err;
/* Validate the input FDT */
fdt = early_memremap(fdt_phys, fdt_len);
if (!fdt) {
pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
- err = -EFAULT;
- goto out;
+ goto err_report;
}
err = fdt_check_header(fdt);
if (err) {
pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
fdt_phys, err);
- err = -EINVAL;
- goto out;
+ goto err_unmap_fdt;
}
err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
if (err) {
pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
fdt_phys, KHO_FDT_COMPATIBLE, err);
- err = -EINVAL;
- goto out;
+ goto err_unmap_fdt;
}
mem_map_phys = kho_get_mem_map_phys(fdt);
- if (!mem_map_phys) {
- err = -ENOENT;
- goto out;
- }
+ if (!mem_map_phys)
+ goto err_unmap_fdt;
scratch = early_memremap(scratch_phys, scratch_len);
if (!scratch) {
pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
scratch_phys, scratch_len);
- err = -EFAULT;
- goto out;
+ goto err_unmap_fdt;
}
/*
@@ -1497,7 +1509,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
if (WARN_ON(err)) {
pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
&area->addr, &size, ERR_PTR(err));
- goto out;
+ goto err_unmap_scratch;
}
pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
}
@@ -1519,13 +1531,14 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
kho_scratch_cnt = scratch_cnt;
pr_info("found kexec handover data.\n");
-out:
- if (fdt)
- early_memunmap(fdt, fdt_len);
- if (scratch)
- early_memunmap(scratch, scratch_len);
- if (err)
- pr_warn("disabling KHO revival: %d\n", err);
+ return;
+
+err_unmap_scratch:
+ early_memunmap(scratch, scratch_len);
+err_unmap_fdt:
+ early_memunmap(fdt, fdt_len);
+err_report:
+ pr_warn("disabling KHO revival\n");
}
/* Helper functions for kexec_file_load */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 944663d99dd9..dda7bb57d421 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -35,8 +35,7 @@
* iommu, interrupts, vfio, participating filesystems, and memory management.
*
* LUO uses Kexec Handover to transfer memory state from the current kernel to
- * the next kernel. For more details see
- * Documentation/core-api/kho/concepts.rst.
+ * the next kernel. For more details see Documentation/core-api/kho/index.rst.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -128,7 +127,9 @@ static int __init luo_early_startup(void)
if (err)
return err;
- return 0;
+ err = luo_flb_setup_incoming(luo_global.fdt_in);
+
+ return err;
}
static int __init liveupdate_early_init(void)
@@ -165,6 +166,7 @@ static int __init luo_fdt_setup(void)
err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
err |= luo_session_setup_outgoing(fdt_out);
+ err |= luo_flb_setup_outgoing(fdt_out);
err |= fdt_end_node(fdt_out);
err |= fdt_finish(fdt_out);
if (err)
@@ -226,6 +228,8 @@ int liveupdate_reboot(void)
if (err)
return err;
+ luo_flb_serialize();
+
err = kho_finalize();
if (err) {
pr_err("kho_finalize failed %d\n", err);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a32a777f6df8..4c7df52a6507 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -104,6 +104,7 @@
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/luo.h>
+#include <linux/list_private.h>
#include <linux/liveupdate.h>
#include <linux/module.h>
#include <linux/sizes.h>
@@ -273,7 +274,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
goto err_fput;
err = -ENOENT;
- luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (fh->ops->can_preserve(fh, file)) {
err = 0;
break;
@@ -284,10 +285,14 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
if (err)
goto err_free_files_mem;
+ err = luo_flb_file_preserve(fh);
+ if (err)
+ goto err_free_files_mem;
+
luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
if (!luo_file) {
err = -ENOMEM;
- goto err_free_files_mem;
+ goto err_flb_unpreserve;
}
luo_file->file = file;
@@ -311,6 +316,8 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
err_kfree:
kfree(luo_file);
+err_flb_unpreserve:
+ luo_flb_file_unpreserve(fh);
err_free_files_mem:
luo_free_files_mem(file_set);
err_fput:
@@ -352,6 +359,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
luo_file->fh->ops->unpreserve(&args);
+ luo_flb_file_unpreserve(luo_file->fh);
list_del(&luo_file->list);
file_set->count--;
@@ -402,8 +410,6 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
luo_file->fh->ops->unfreeze(&args);
}
-
- luo_file->serialized_data = 0;
}
static void __luo_file_unfreeze(struct luo_file_set *file_set,
@@ -629,6 +635,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
args.retrieved = luo_file->retrieved;
luo_file->fh->ops->finish(&args);
+ luo_flb_file_finish(luo_file->fh);
}
/**
@@ -760,7 +767,7 @@ int luo_file_deserialize(struct luo_file_set *file_set,
bool handler_found = false;
struct luo_file *luo_file;
- luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (!strcmp(fh->compatible, file_ser[i].compatible)) {
handler_found = true;
break;
@@ -835,7 +842,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
return -EBUSY;
/* Check for duplicate compatible strings */
- luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) {
if (!strcmp(fh_iter->compatible, fh->compatible)) {
pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
fh->compatible);
@@ -850,10 +857,13 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
goto err_resume;
}
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list));
INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
luo_session_resume();
+ liveupdate_test_register(fh);
+
return 0;
err_resume:
@@ -870,23 +880,38 @@ err_resume:
*
* It ensures safe removal by checking that:
* No live update session is currently in progress.
+ * No FLB registered with this file handler.
*
* If the unregistration fails, the internal test state is reverted.
*
* Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
- * update is in progress, can't quiesce live update.
+ * update is in progress, can't quiesce live update, or an FLB is registered
+ * with this file handler.
*/
int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
{
+ int err = -EBUSY;
+
if (!liveupdate_enabled())
return -EOPNOTSUPP;
+ liveupdate_test_unregister(fh);
+
if (!luo_session_quiesce())
- return -EBUSY;
+ goto err_register;
+
+ if (!list_empty(&ACCESS_PRIVATE(fh, flb_list)))
+ goto err_resume;
list_del(&ACCESS_PRIVATE(fh, list));
module_put(fh->ops->owner);
luo_session_resume();
return 0;
+
+err_resume:
+ luo_session_resume();
+err_register:
+ liveupdate_test_register(fh);
+ return err;
}
diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c
new file mode 100644
index 000000000000..4c437de5c0b0
--- /dev/null
+++ b/kernel/liveupdate/luo_flb.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Lifecycle Bound Global Data
+ *
+ * File-Lifecycle-Bound (FLB) objects provide a mechanism for managing global
+ * state that is shared across multiple live-updatable files. The lifecycle of
+ * this shared state is tied to the preservation of the files that depend on it.
+ *
+ * An FLB represents a global resource, such as the IOMMU core state, that is
+ * required by multiple file descriptors (e.g., all VFIO fds).
+ *
+ * The preservation of the FLB's state is triggered when the *first* file
+ * depending on it is preserved. The cleanup of this state (unpreserve or
+ * finish) is triggered when the *last* file depending on it is unpreserved or
+ * finished.
+ *
+ * Handler Dependency: A file handler declares its dependency on one or more
+ * FLBs by registering them via liveupdate_register_flb().
+ *
+ * Callback Model: Each FLB is defined by a set of operations
+ * (&struct liveupdate_flb_ops) that LUO invokes at key points:
+ *
+ * - .preserve(): Called for the first file. Saves global state.
+ * - .unpreserve(): Called for the last file (if aborted pre-reboot).
+ * - .retrieve(): Called on-demand in the new kernel to restore the state.
+ * - .finish(): Called for the last file in the new kernel for cleanup.
+ *
+ * This reference-counted approach ensures that shared state is saved exactly
+ * once and restored exactly once, regardless of how many files depend on it,
+ * and that its lifecycle is correctly managed across the kexec transition.
+ */
+
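To make the callback model above concrete, here is a minimal, hypothetical sketch of a subsystem defining an FLB and attaching it to its file handler; it assumes the declarations from <linux/liveupdate.h>. Every identifier, the ops bodies, and the exact ops structure layout are illustrative only, inferred from how this file invokes the callbacks.

/* Illustration only -- all identifiers below are hypothetical. */
static int example_flb_preserve(struct liveupdate_flb_op_args *args)
{
	/* Serialize the shared state once; hand back a preserved handle. */
	args->data = 0;
	args->obj = NULL;
	return 0;
}

static void example_flb_unpreserve(struct liveupdate_flb_op_args *args)
{
	/* Undo example_flb_preserve() when the last file is unpreserved. */
}

static int example_flb_retrieve(struct liveupdate_flb_op_args *args)
{
	/* In the new kernel: rebuild the live object from args->data. */
	args->obj = NULL;
	return 0;
}

static void example_flb_finish(struct liveupdate_flb_op_args *args)
{
	/* Final cleanup once the last dependent file is finished. */
}

static struct liveupdate_flb_ops example_flb_ops = {
	.preserve	= example_flb_preserve,
	.unpreserve	= example_flb_unpreserve,
	.retrieve	= example_flb_retrieve,
	.finish		= example_flb_finish,
	.owner		= THIS_MODULE,
};

static struct liveupdate_flb example_flb = {
	.ops		= &example_flb_ops,
	.compatible	= "example-flb-v1",
};

/* In the subsystem's init, after its file handler 'example_fh' is registered: */
/*	err = liveupdate_register_flb(&example_fh, &example_flb);		 */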
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list_private.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include "luo_internal.h"
+
+#define LUO_FLB_PGCNT 1ul
+#define LUO_FLB_MAX (((LUO_FLB_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_flb_header_ser)) / sizeof(struct luo_flb_ser))
+
+struct luo_flb_header {
+ struct luo_flb_header_ser *header_ser;
+ struct luo_flb_ser *ser;
+ bool active;
+};
+
+struct luo_flb_global {
+ struct luo_flb_header incoming;
+ struct luo_flb_header outgoing;
+ struct list_head list;
+ long count;
+};
+
+static struct luo_flb_global luo_flb_global = {
+ .list = LIST_HEAD_INIT(luo_flb_global.list),
+};
+
+/*
+ * struct luo_flb_link - Links an FLB definition to a file handler's internal
+ * list of dependencies.
+ * @flb: A pointer to the registered &struct liveupdate_flb definition.
+ * @list: The list_head for linking.
+ */
+struct luo_flb_link {
+ struct liveupdate_flb *flb;
+ struct list_head list;
+};
+
+/* luo_flb_get_private - Access private field, and if needed initialize it. */
+static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private);
+
+ if (!private->initialized) {
+ mutex_init(&private->incoming.lock);
+ mutex_init(&private->outgoing.lock);
+ INIT_LIST_HEAD(&private->list);
+ private->users = 0;
+ private->initialized = true;
+ }
+
+ return private;
+}
+
+static int luo_flb_file_preserve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ scoped_guard(mutex, &private->outgoing.lock) {
+ if (!private->outgoing.count) {
+ struct liveupdate_flb_op_args args = {0};
+ int err;
+
+ args.flb = flb;
+ err = flb->ops->preserve(&args);
+ if (err)
+ return err;
+ private->outgoing.data = args.data;
+ private->outgoing.obj = args.obj;
+ }
+ private->outgoing.count++;
+ }
+
+ return 0;
+}
+
+static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ scoped_guard(mutex, &private->outgoing.lock) {
+ private->outgoing.count--;
+ if (!private->outgoing.count) {
+ struct liveupdate_flb_op_args args = {0};
+
+ args.flb = flb;
+ args.data = private->outgoing.data;
+ args.obj = private->outgoing.obj;
+
+ if (flb->ops->unpreserve)
+ flb->ops->unpreserve(&args);
+
+ private->outgoing.data = 0;
+ private->outgoing.obj = NULL;
+ }
+ }
+}
+
+static int luo_flb_retrieve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct luo_flb_header *fh = &luo_flb_global.incoming;
+ struct liveupdate_flb_op_args args = {0};
+ bool found = false;
+ int err;
+
+ guard(mutex)(&private->incoming.lock);
+
+ if (private->incoming.finished)
+ return -ENODATA;
+
+ if (private->incoming.retrieved)
+ return 0;
+
+ if (!fh->active)
+ return -ENODATA;
+
+ for (int i = 0; i < fh->header_ser->count; i++) {
+ if (!strcmp(fh->ser[i].name, flb->compatible)) {
+ private->incoming.data = fh->ser[i].data;
+ private->incoming.count = fh->ser[i].count;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ args.flb = flb;
+ args.data = private->incoming.data;
+
+ err = flb->ops->retrieve(&args);
+ if (err)
+ return err;
+
+ private->incoming.obj = args.obj;
+ private->incoming.retrieved = true;
+
+ return 0;
+}
+
+static void luo_flb_file_finish_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ u64 count;
+
+ scoped_guard(mutex, &private->incoming.lock)
+ count = --private->incoming.count;
+
+ if (!count) {
+ struct liveupdate_flb_op_args args = {0};
+
+ if (!private->incoming.retrieved) {
+ int err = luo_flb_retrieve_one(flb);
+
+ if (WARN_ON(err))
+ return;
+ }
+
+ scoped_guard(mutex, &private->incoming.lock) {
+ args.flb = flb;
+ args.obj = private->incoming.obj;
+ flb->ops->finish(&args);
+
+ private->incoming.data = 0;
+ private->incoming.obj = NULL;
+ private->incoming.finished = true;
+ }
+ }
+}
+
+/**
+ * luo_flb_file_preserve - Notifies FLBs that a file is about to be preserved.
+ * @fh: The file handler for the preserved file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler. It increments the reference count for each FLB. If the count becomes
+ * 1, it triggers the FLB's .preserve() callback to save the global state.
+ *
+ * This operation is atomic. If any FLB's .preserve() op fails, it will roll
+ * back by calling .unpreserve() on any FLBs that were successfully preserved
+ * during this call.
+ *
+ * Context: Called from luo_preserve_file()
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_flb_file_preserve(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+ int err = 0;
+
+ list_for_each_entry(iter, flb_list, list) {
+ err = luo_flb_file_preserve_one(iter->flb);
+ if (err)
+ goto exit_err;
+ }
+
+ return 0;
+
+exit_err:
+ list_for_each_entry_continue_reverse(iter, flb_list, list)
+ luo_flb_file_unpreserve_one(iter->flb);
+
+ return err;
+}
+
+/**
+ * luo_flb_file_unpreserve - Notifies FLBs that a dependent file was unpreserved.
+ * @fh: The file handler for the unpreserved file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler, in reverse order of registration. It decrements the reference count
+ * for each FLB. If the count becomes 0, it triggers the FLB's .unpreserve()
+ * callback to clean up the global state.
+ *
+ * Context: Called when a preserved file is being cleaned up before reboot
+ * (e.g., from luo_file_unpreserve_files()).
+ */
+void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+
+ list_for_each_entry_reverse(iter, flb_list, list)
+ luo_flb_file_unpreserve_one(iter->flb);
+}
+
+/**
+ * luo_flb_file_finish - Notifies FLBs that a dependent file has been finished.
+ * @fh: The file handler for the finished file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler, in reverse order of registration. It decrements the incoming
+ * reference count for each FLB. If the count becomes 0, it triggers the FLB's
+ * .finish() callback for final cleanup in the new kernel.
+ *
+ * Context: Called from luo_file_finish() for each file being finished.
+ */
+void luo_flb_file_finish(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+
+ list_for_each_entry_reverse(iter, flb_list, list)
+ luo_flb_file_finish_one(iter->flb);
+}
+
+/**
+ * liveupdate_register_flb - Associate an FLB with a file handler and register it globally.
+ * @fh: The file handler that will now depend on the FLB.
+ * @flb: The File-Lifecycle-Bound object to associate.
+ *
+ * Establishes a dependency, informing the LUO core that whenever a file of
+ * type @fh is preserved, the state of @flb must also be managed.
+ *
+ * On the first registration of a given @flb object, it is added to a global
+ * registry. This function checks for duplicate registrations, both for a
+ * specific handler and globally, and ensures the total number of unique
+ * FLBs does not exceed the system limit.
+ *
+ * Context: Typically called from a subsystem's module init function after
+ * both the handler and the FLB have been defined and initialized.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EINVAL if arguments are NULL or not initialized.
+ * -ENOMEM on memory allocation failure.
+ * -EEXIST if this FLB is already registered with this handler.
+ * -ENOSPC if the maximum number of global FLBs has been reached.
+ * -EOPNOTSUPP if live update is disabled or not configured.
+ */
+int liveupdate_register_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *link __free(kfree) = NULL;
+ struct liveupdate_flb *gflb;
+ struct luo_flb_link *iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!flb->ops->preserve || !flb->ops->unpreserve ||
+ !flb->ops->retrieve || !flb->ops->finish)) {
+ return -EINVAL;
+ }
+
+ /*
+ * File handler must already be registered, as it initializes the
+ * flb_list.
+ */
+ if (WARN_ON(list_empty(&ACCESS_PRIVATE(fh, list))))
+ return -EINVAL;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This acts as a global lock for registration: no other thread can
+ * be in this section, and no sessions can be creating/using FDs.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check that this FLB is not already linked to this file handler */
+ err = -EEXIST;
+ list_for_each_entry(iter, flb_list, list) {
+ if (iter->flb == flb)
+ goto err_resume;
+ }
+
+ /*
+ * If this FLB is not yet linked to the global list, this is the first
+ * time it is registered.
+ */
+ if (!private->users) {
+ if (WARN_ON(!list_empty(&private->list))) {
+ err = -EINVAL;
+ goto err_resume;
+ }
+
+ if (luo_flb_global.count == LUO_FLB_MAX) {
+ err = -ENOSPC;
+ goto err_resume;
+ }
+
+ /* Check that compatible string is unique in global list */
+ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
+ if (!strcmp(gflb->compatible, flb->compatible))
+ goto err_resume;
+ }
+
+ if (!try_module_get(flb->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ list_add_tail(&private->list, &luo_flb_global.list);
+ luo_flb_global.count++;
+ }
+
+ /* Finally, link the FLB to the file handler */
+ private->users++;
+ link->flb = flb;
+ list_add_tail(&no_free_ptr(link)->list, flb_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_flb - Remove an FLB dependency from a file handler.
+ * @fh: The file handler that is currently depending on the FLB.
+ * @flb: The File-Lifecycle-Bound object to remove.
+ *
+ * Removes the association between the specified file handler and the FLB
+ * previously established by liveupdate_register_flb().
+ *
+ * This function manages the global lifecycle of the FLB. It decrements the
+ * FLB's usage count. If this was the last file handler referencing this FLB,
+ * the FLB is removed from the global registry and the reference to its
+ * owner module (acquired during registration) is released.
+ *
+ * Context: This function ensures the session is quiesced (no active FDs
+ * being created) during the update. It is typically called from a
+ * subsystem's module exit function.
+ * Return: 0 on success.
+ * -EOPNOTSUPP if live update is disabled.
+ * -EBUSY if the live update session is active and cannot be quiesced.
+ * -ENOENT if the FLB was not found in the file handler's list.
+ */
+int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+ int err = -ENOENT;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This acts as a global lock for unregistration.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Find and remove the link from the file handler's list */
+ list_for_each_entry(iter, flb_list, list) {
+ if (iter->flb == flb) {
+ list_del(&iter->list);
+ kfree(iter);
+ err = 0;
+ break;
+ }
+ }
+
+ if (err)
+ goto err_resume;
+
+ private->users--;
+ /*
+ * If this is the last file handler with which we are registered, remove
+ * it from the global list and release the module reference.
+ */
+ if (!private->users) {
+ list_del_init(&private->list);
+ luo_flb_global.count--;
+ module_put(flb->ops->owner);
+ }
+
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_flb_get_incoming - Retrieve the incoming FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ *
+ * Returns a pointer to its shared live object for the incoming (post-reboot)
+ * path.
+ *
+ * If this is the first time the object is requested in the new kernel, this
+ * function will trigger the FLB's .retrieve() callback to reconstruct the
+ * object from its preserved state. Subsequent calls will return the same
+ * cached object.
+ *
+ * Return: 0 on success, or a negative errno on failure. -ENODATA means no
+ * incoming FLB data, -ENOENT means the specific FLB was not found in the incoming
+ * data, and -EOPNOTSUPP when live update is disabled or not configured.
+ */
+int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!private->incoming.obj) {
+ int err = luo_flb_retrieve_one(flb);
+
+ if (err)
+ return err;
+ }
+
+ guard(mutex)(&private->incoming.lock);
+ *objp = private->incoming.obj;
+
+ return 0;
+}
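A hypothetical consumer of the incoming path described above: in the new kernel, a subsystem requests the shared object on demand and treats -ENODATA/-ENOENT as "nothing was handed over". 'example_flb' refers to the sketch after the DOC block; the function is illustrative, not part of this patch.

/* Hypothetical consumer in the new kernel -- not part of this patch. */
static int example_restore_shared_state(void)
{
	void *obj;
	int err;

	err = liveupdate_flb_get_incoming(&example_flb, &obj);
	if (err)
		return err;	/* e.g. -ENODATA: no incoming FLB data */

	/* 'obj' is whatever example_flb_retrieve() reconstructed. */
	return 0;
}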
+
+/**
+ * liveupdate_flb_get_outgoing - Retrieve the outgoing FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ *
+ * Returns a pointer to its shared live object for the outgoing (pre-reboot)
+ * path.
+ *
+ * This function assumes the object has already been created by the FLB's
+ * .preserve() callback, which is triggered when the first dependent file
+ * is preserved.
+ *
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ guard(mutex)(&private->outgoing.lock);
+ *objp = private->outgoing.obj;
+
+ return 0;
+}
+
+int __init luo_flb_setup_outgoing(void *fdt_out)
+{
+ struct luo_flb_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_FLB_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_FLB_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_FLB_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_FLB_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ header_ser->pgcnt = LUO_FLB_PGCNT;
+ luo_flb_global.outgoing.header_ser = header_ser;
+ luo_flb_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_flb_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+
+ return err;
+}
+
+int __init luo_flb_setup_incoming(void *fdt_in)
+{
+ struct luo_flb_header_ser *header_ser;
+ int err, header_size, offset;
+ const void *ptr;
+ u64 header_ser_pa;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_FLB_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get FLB node [%s]\n", LUO_FDT_FLB_NODE_NAME);
+
+ return -ENOENT;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_FLB_COMPATIBLE);
+ if (err) {
+ pr_err("FLB node is incompatible with '%s' [%d]\n",
+ LUO_FDT_FLB_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_FLB_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get FLB header property '%s' [%d]\n",
+ LUO_FDT_FLB_HEADER, header_size);
+
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_flb_global.incoming.header_ser = header_ser;
+ luo_flb_global.incoming.ser = (void *)(header_ser + 1);
+ luo_flb_global.incoming.active = true;
+
+ return 0;
+}
+
+/**
+ * luo_flb_serialize - Serializes all active FLB objects for KHO.
+ *
+ * This function is called from the reboot path. It iterates through all
+ * registered File-Lifecycle-Bound (FLB) objects. For each FLB that has been
+ * preserved (i.e., its reference count is greater than zero), it writes its
+ * metadata into the memory region designated for Kexec Handover.
+ *
+ * The serialized data includes the FLB's compatibility string, its opaque
+ * data handle, and the final reference count. This allows the new kernel to
+ * find the appropriate handler and reconstruct the FLB's state.
+ *
+ * Context: Called from liveupdate_reboot() just before kho_finalize().
+ */
+void luo_flb_serialize(void)
+{
+ struct luo_flb_header *fh = &luo_flb_global.outgoing;
+ struct liveupdate_flb *gflb;
+ int i = 0;
+
+ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
+ struct luo_flb_private *private = luo_flb_get_private(gflb);
+
+ if (private->outgoing.count > 0) {
+ strscpy(fh->ser[i].name, gflb->compatible,
+ sizeof(fh->ser[i].name));
+ fh->ser[i].data = private->outgoing.data;
+ fh->ser[i].count = private->outgoing.count;
+ i++;
+ }
+ }
+
+ fh->header_ser->count = i;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index c8973b543d1d..8083d8739b09 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -40,13 +40,6 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
*/
#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
-/* Mimics list_for_each_entry() but for private list head entries */
-#define luo_list_for_each_private(pos, head, member) \
- for (struct list_head *__iter = (head)->next; \
- __iter != (head) && \
- ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
- __iter = __iter->next)
-
/**
* struct luo_file_set - A set of files that belong to the same sessions.
* @files_list: An ordered list of files associated with this session, it is
@@ -107,4 +100,19 @@ int luo_file_deserialize(struct luo_file_set *file_set,
void luo_file_set_init(struct luo_file_set *file_set);
void luo_file_set_destroy(struct luo_file_set *file_set);
+int luo_flb_file_preserve(struct liveupdate_file_handler *fh);
+void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh);
+void luo_flb_file_finish(struct liveupdate_file_handler *fh);
+int __init luo_flb_setup_outgoing(void *fdt);
+int __init luo_flb_setup_incoming(void *fdt);
+void luo_flb_serialize(void);
+
+#ifdef CONFIG_LIVEUPDATE_TEST
+void liveupdate_test_register(struct liveupdate_file_handler *fh);
+void liveupdate_test_unregister(struct liveupdate_file_handler *fh);
+#else
+static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { }
+static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { }
+#endif
+
#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index bcb1b9fea588..79b5e45f8d4c 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -13,7 +13,8 @@
#include <linux/slab.h>
#include <linux/ww_mutex.h>
-static DEFINE_WD_CLASS(ww_class);
+static DEFINE_WD_CLASS(wd_class);
+static DEFINE_WW_CLASS(ww_class);
struct workqueue_struct *wq;
#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
@@ -54,16 +55,16 @@ static void test_mutex_work(struct work_struct *work)
ww_mutex_unlock(&mtx->mutex);
}
-static int __test_mutex(unsigned int flags)
+static int __test_mutex(struct ww_class *class, unsigned int flags)
{
#define TIMEOUT (HZ / 16)
struct test_mutex mtx;
struct ww_acquire_ctx ctx;
int ret;
- ww_mutex_init(&mtx.mutex, &ww_class);
+ ww_mutex_init(&mtx.mutex, class);
if (flags & TEST_MTX_CTX)
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init(&ctx, class);
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
init_completion(&mtx.ready);
@@ -71,7 +72,7 @@ static int __test_mutex(unsigned int flags)
init_completion(&mtx.done);
mtx.flags = flags;
- schedule_work(&mtx.work);
+ queue_work(wq, &mtx.work);
wait_for_completion(&mtx.ready);
ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL);
@@ -106,13 +107,13 @@ static int __test_mutex(unsigned int flags)
#undef TIMEOUT
}
-static int test_mutex(void)
+static int test_mutex(struct ww_class *class)
{
int ret;
int i;
for (i = 0; i < __TEST_MTX_LAST; i++) {
- ret = __test_mutex(i);
+ ret = __test_mutex(class, i);
if (ret)
return ret;
}
@@ -120,15 +121,15 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(bool trylock)
+static int test_aa(struct ww_class *class, bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
const char *from = trylock ? "trylock" : "lock";
- ww_mutex_init(&mutex, &ww_class);
- ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_init(&mutex, class);
+ ww_acquire_init(&ctx, class);
if (!trylock) {
ret = ww_mutex_lock(&mutex, &ctx);
@@ -177,6 +178,7 @@ out:
struct test_abba {
struct work_struct work;
+ struct ww_class *class;
struct ww_mutex a_mutex;
struct ww_mutex b_mutex;
struct completion a_ready;
@@ -191,7 +193,7 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init_noinject(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, abba->class);
if (!abba->trylock)
ww_mutex_lock(&abba->b_mutex, &ctx);
else
@@ -217,23 +219,24 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool trylock, bool resolve)
+static int test_abba(struct ww_class *class, bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
int err, ret;
- ww_mutex_init(&abba.a_mutex, &ww_class);
- ww_mutex_init(&abba.b_mutex, &ww_class);
+ ww_mutex_init(&abba.a_mutex, class);
+ ww_mutex_init(&abba.b_mutex, class);
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.class = class;
abba.trylock = trylock;
abba.resolve = resolve;
- schedule_work(&abba.work);
+ queue_work(wq, &abba.work);
- ww_acquire_init_noinject(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, class);
if (!trylock)
ww_mutex_lock(&abba.a_mutex, &ctx);
else
@@ -278,6 +281,7 @@ static int test_abba(bool trylock, bool resolve)
struct test_cycle {
struct work_struct work;
+ struct ww_class *class;
struct ww_mutex a_mutex;
struct ww_mutex *b_mutex;
struct completion *a_signal;
@@ -291,7 +295,7 @@ static void test_cycle_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err, erra = 0;
- ww_acquire_init_noinject(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, cycle->class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -314,7 +318,7 @@ static void test_cycle_work(struct work_struct *work)
cycle->result = err ?: erra;
}
-static int __test_cycle(unsigned int nthreads)
+static int __test_cycle(struct ww_class *class, unsigned int nthreads)
{
struct test_cycle *cycles;
unsigned int n, last = nthreads - 1;
@@ -327,7 +331,8 @@ static int __test_cycle(unsigned int nthreads)
for (n = 0; n < nthreads; n++) {
struct test_cycle *cycle = &cycles[n];
- ww_mutex_init(&cycle->a_mutex, &ww_class);
+ cycle->class = class;
+ ww_mutex_init(&cycle->a_mutex, class);
if (n == last)
cycle->b_mutex = &cycles[0].a_mutex;
else
@@ -367,13 +372,13 @@ static int __test_cycle(unsigned int nthreads)
return ret;
}
-static int test_cycle(unsigned int ncpus)
+static int test_cycle(struct ww_class *class, unsigned int ncpus)
{
unsigned int n;
int ret;
for (n = 2; n <= ncpus + 1; n++) {
- ret = __test_cycle(n);
+ ret = __test_cycle(class, n);
if (ret)
return ret;
}
@@ -384,6 +389,7 @@ static int test_cycle(unsigned int ncpus)
struct stress {
struct work_struct work;
struct ww_mutex *locks;
+ struct ww_class *class;
unsigned long timeout;
int nlocks;
};
@@ -443,7 +449,7 @@ static void stress_inorder_work(struct work_struct *work)
int contended = -1;
int n, err;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init(&ctx, stress->class);
retry:
err = 0;
for (n = 0; n < nlocks; n++) {
@@ -511,7 +517,7 @@ static void stress_reorder_work(struct work_struct *work)
order = NULL;
do {
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init(&ctx, stress->class);
list_for_each_entry(ll, &locks, link) {
err = ww_mutex_lock(ll->lock, &ctx);
@@ -570,7 +576,7 @@ static void stress_one_work(struct work_struct *work)
#define STRESS_ONE BIT(2)
#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
-static int stress(int nlocks, int nthreads, unsigned int flags)
+static int stress(struct ww_class *class, int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
struct stress *stress_array;
@@ -588,7 +594,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
}
for (n = 0; n < nlocks; n++)
- ww_mutex_init(&locks[n], &ww_class);
+ ww_mutex_init(&locks[n], class);
count = 0;
for (n = 0; nthreads; n++) {
@@ -617,6 +623,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
+ stress->class = class;
stress->locks = locks;
stress->nlocks = nlocks;
stress->timeout = jiffies + 2*HZ;
@@ -635,59 +642,131 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
return 0;
}
-static int __init test_ww_mutex_init(void)
+static int run_tests(struct ww_class *class)
{
int ncpus = num_online_cpus();
int ret, i;
- printk(KERN_INFO "Beginning ww mutex selftests\n");
-
- prandom_seed_state(&rng, get_random_u64());
-
- wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
- if (!wq)
- return -ENOMEM;
-
- ret = test_mutex();
+ ret = test_mutex(class);
if (ret)
return ret;
- ret = test_aa(false);
+ ret = test_aa(class, false);
if (ret)
return ret;
- ret = test_aa(true);
+ ret = test_aa(class, true);
if (ret)
return ret;
for (i = 0; i < 4; i++) {
- ret = test_abba(i & 1, i & 2);
+ ret = test_abba(class, i & 1, i & 2);
if (ret)
return ret;
}
- ret = test_cycle(ncpus);
+ ret = test_cycle(class, ncpus);
+ if (ret)
+ return ret;
+
+ ret = stress(class, 16, 2 * ncpus, STRESS_INORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(class, 16, 2 * ncpus, STRESS_REORDER);
if (ret)
return ret;
- ret = stress(16, 2*ncpus, STRESS_INORDER);
+ ret = stress(class, 2046, hweight32(STRESS_ALL) * ncpus, STRESS_ALL);
if (ret)
return ret;
- ret = stress(16, 2*ncpus, STRESS_REORDER);
+ return 0;
+}
+
+static int run_test_classes(void)
+{
+ int ret;
+
+ pr_info("Beginning ww (wound) mutex selftests\n");
+
+ ret = run_tests(&ww_class);
if (ret)
return ret;
- ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ pr_info("Beginning ww (die) mutex selftests\n");
+ ret = run_tests(&wd_class);
if (ret)
return ret;
- printk(KERN_INFO "All ww mutex selftests passed\n");
+ pr_info("All ww mutex selftests passed\n");
return 0;
}
+static DEFINE_MUTEX(run_lock);
+
+static ssize_t run_tests_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!mutex_trylock(&run_lock)) {
+ pr_err("Test already running\n");
+ return count;
+ }
+
+ run_test_classes();
+ mutex_unlock(&run_lock);
+
+ return count;
+}
+
+static struct kobj_attribute run_tests_attribute =
+ __ATTR(run_tests, 0664, NULL, run_tests_store);
+
+static struct attribute *attrs[] = {
+ &run_tests_attribute.attr,
+ NULL, /* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+static struct kobject *test_ww_mutex_kobj;
+
+static int __init test_ww_mutex_init(void)
+{
+ int ret;
+
+ prandom_seed_state(&rng, get_random_u64());
+
+ wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+
+ test_ww_mutex_kobj = kobject_create_and_add("test_ww_mutex", kernel_kobj);
+ if (!test_ww_mutex_kobj) {
+ destroy_workqueue(wq);
+ return -ENOMEM;
+ }
+
+ /* Create the files associated with this kobject */
+ ret = sysfs_create_group(test_ww_mutex_kobj, &attr_group);
+ if (ret) {
+ kobject_put(test_ww_mutex_kobj);
+ destroy_workqueue(wq);
+ return ret;
+ }
+
+ mutex_lock(&run_lock);
+ ret = run_test_classes();
+ mutex_unlock(&run_lock);
+
+ return ret;
+}
+
static void __exit test_ww_mutex_exit(void)
{
+ kobject_put(test_ww_mutex_kobj);
destroy_workqueue(wq);
}
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 2a1beebf1d37..be74917802ad 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -299,10 +299,6 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
-config MODULE_SIG_SHA1
- bool "SHA-1"
- select CRYPTO_SHA1
-
config MODULE_SIG_SHA256
bool "SHA-256"
select CRYPTO_SHA256
@@ -332,7 +328,6 @@ endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
- default "sha1" if MODULE_SIG_SHA1
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 474e68f0f063..36f52a232a12 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -17,16 +17,16 @@
static int module_extend_max_pages(struct load_info *info, unsigned int extent)
{
struct page **new_pages;
+ unsigned int new_max = info->max_pages + extent;
- new_pages = kvmalloc_array(info->max_pages + extent,
- sizeof(info->pages), GFP_KERNEL);
+ new_pages = kvrealloc(info->pages,
+ size_mul(new_max, sizeof(*info->pages)),
+ GFP_KERNEL);
if (!new_pages)
return -ENOMEM;
- memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
- kvfree(info->pages);
info->pages = new_pages;
- info->max_pages += extent;
+ info->max_pages = new_max;
return 0;
}
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index bd2149fbe117..0b633f2edda6 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -113,7 +113,7 @@ static void kmod_dup_request_complete(struct work_struct *work)
* let this linger forever as this is just a boot optimization for
* possible abuses of vmalloc() incurred by finit_module() thrashing.
*/
- queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+ queue_delayed_work(system_dfl_wq, &kmod_req->delete_work, 60 * HZ);
}
bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
@@ -240,7 +240,7 @@ void kmod_dup_request_announce(char *module_name, int ret)
* There is no rush. But we also don't want to hold the
* caller up forever or introduce any boot delays.
*/
- queue_work(system_wq, &kmod_req->complete_work);
+ queue_work(system_dfl_wq, &kmod_req->complete_work);
out:
mutex_unlock(&kmod_dup_mutex);
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 00a60796327c..0fc11e45df9b 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -334,13 +334,8 @@ int module_address_lookup(unsigned long addr,
if (mod) {
if (modname)
*modname = mod->name;
- if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- *modbuildid = mod->build_id;
-#else
- *modbuildid = NULL;
-#endif
- }
+ if (modbuildid)
+ *modbuildid = module_buildid(mod);
sym = find_kallsyms_symbol(mod, addr, size, offset);
diff --git a/kernel/padata.c b/kernel/padata.c
index aa66d91e20f9..db7c75787a2b 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -819,7 +819,7 @@ static void __padata_free(struct padata_instance *pinst)
#define kobj2pinst(_kobj) \
container_of(_kobj, struct padata_instance, kobj)
#define attr2pentry(_attr) \
- container_of(_attr, struct padata_sysfs_entry, attr)
+ container_of_const(_attr, struct padata_sysfs_entry, attr)
static void padata_sysfs_release(struct kobject *kobj)
{
@@ -829,13 +829,13 @@ static void padata_sysfs_release(struct kobject *kobj)
struct padata_sysfs_entry {
struct attribute attr;
- ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
- ssize_t (*store)(struct padata_instance *, struct attribute *,
+ ssize_t (*show)(struct padata_instance *, const struct attribute *, char *);
+ ssize_t (*store)(struct padata_instance *, const struct attribute *,
const char *, size_t);
};
static ssize_t show_cpumask(struct padata_instance *pinst,
- struct attribute *attr, char *buf)
+ const struct attribute *attr, char *buf)
{
struct cpumask *cpumask;
ssize_t len;
@@ -853,7 +853,7 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
}
static ssize_t store_cpumask(struct padata_instance *pinst,
- struct attribute *attr,
+ const struct attribute *attr,
const char *buf, size_t count)
{
cpumask_var_t new_cpumask;
@@ -880,10 +880,10 @@ out:
}
#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
- static struct padata_sysfs_entry _name##_attr = \
+ static const struct padata_sysfs_entry _name##_attr = \
__ATTR(_name, 0644, _show_name, _store_name)
-#define PADATA_ATTR_RO(_name, _show_name) \
- static struct padata_sysfs_entry _name##_attr = \
+#define PADATA_ATTR_RO(_name, _show_name) \
+ static const struct padata_sysfs_entry _name##_attr = \
__ATTR(_name, 0400, _show_name, NULL)
PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
@@ -894,7 +894,7 @@ PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
* serial_cpumask [RW] - cpumask for serial workers
* parallel_cpumask [RW] - cpumask for parallel workers
*/
-static struct attribute *padata_default_attrs[] = {
+static const struct attribute *const padata_default_attrs[] = {
&serial_cpumask_attr.attr,
&parallel_cpumask_attr.attr,
NULL,
@@ -904,8 +904,8 @@ ATTRIBUTE_GROUPS(padata_default);
static ssize_t padata_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
+ const struct padata_sysfs_entry *pentry;
struct padata_instance *pinst;
- struct padata_sysfs_entry *pentry;
ssize_t ret = -EIO;
pinst = kobj2pinst(kobj);
@@ -919,8 +919,8 @@ static ssize_t padata_sysfs_show(struct kobject *kobj,
static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
+ const struct padata_sysfs_entry *pentry;
struct padata_instance *pinst;
- struct padata_sysfs_entry *pentry;
ssize_t ret = -EIO;
pinst = kobj2pinst(kobj);
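
The const conversion leans on container_of_const(), which propagates the constness of the pointer it is given, so the const struct attribute * arriving from sysfs yields a const entry pointer without casts. A small illustrative sketch (my_entry is made up, not part of the patch):

        struct my_entry {
                struct attribute attr;
                int value;
        };

        /* container_of_const() returns a const pointer when @attr is const. */
        static int my_entry_value(const struct attribute *attr)
        {
                const struct my_entry *e = container_of_const(attr, struct my_entry, attr);

                return e->value;
        }
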
diff --git a/kernel/panic.c b/kernel/panic.c
index 0d52210a9e2b..c78600212b6c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -42,6 +42,7 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+#define PANIC_MSG_BUFSZ 1024
#ifdef CONFIG_SMP
/*
@@ -74,6 +75,8 @@ EXPORT_SYMBOL_GPL(panic_timeout);
unsigned long panic_print;
+static int panic_force_cpu = -1;
+
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
@@ -131,7 +134,8 @@ static int proc_taint(const struct ctl_table *table, int write,
static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- panic_print_deprecated();
+ if (write)
+ panic_print_deprecated();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
@@ -299,6 +303,150 @@ void __weak crash_smp_send_stop(void)
}
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP)
+static char *panic_force_buf;
+
+static int __init panic_force_cpu_setup(char *str)
+{
+ int cpu;
+
+ if (!str)
+ return -EINVAL;
+
+ if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) {
+ pr_warn("panic_force_cpu: invalid value '%s'\n", str);
+ return -EINVAL;
+ }
+
+ panic_force_cpu = cpu;
+ return 0;
+}
+early_param("panic_force_cpu", panic_force_cpu_setup);
+
+static int __init panic_force_cpu_late_init(void)
+{
+ if (panic_force_cpu < 0)
+ return 0;
+
+ panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL);
+
+ return 0;
+}
+late_initcall(panic_force_cpu_late_init);
+
+static void do_panic_on_target_cpu(void *info)
+{
+ panic("%s", (char *)info);
+}
+
+/**
+ * panic_smp_redirect_cpu - Redirect panic to target CPU
+ * @target_cpu: CPU that should handle the panic
+ * @msg: formatted panic message
+ *
+ * Default implementation uses IPI. Architectures with NMI support
+ * can override this for more reliable delivery.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+int __weak panic_smp_redirect_cpu(int target_cpu, void *msg)
+{
+ static call_single_data_t panic_csd;
+
+ panic_csd.func = do_panic_on_target_cpu;
+ panic_csd.info = msg;
+
+ return smp_call_function_single_async(target_cpu, &panic_csd);
+}
+
+/**
+ * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel
+ * @fmt: panic message format string
+ * @args: arguments for format string
+ *
+ * Some platforms require panic handling to occur on a specific CPU
+ * for the crash kernel to function correctly. This function redirects
+ * panic handling to the CPU specified via the panic_force_cpu= boot parameter.
+ *
+ * Return: %true if the panic was redirected, %false if the panic should
+ * proceed on the current CPU.
+ */
+__printf(1, 0)
+static bool panic_try_force_cpu(const char *fmt, va_list args)
+{
+ int this_cpu = raw_smp_processor_id();
+ int old_cpu = PANIC_CPU_INVALID;
+ const char *msg;
+
+ /* Feature not enabled via boot parameter */
+ if (panic_force_cpu < 0)
+ return false;
+
+ /* Already on target CPU - proceed normally */
+ if (this_cpu == panic_force_cpu)
+ return false;
+
+ /* Target CPU is offline, can't redirect */
+ if (!cpu_online(panic_force_cpu)) {
+ pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n",
+ panic_force_cpu, this_cpu);
+ return false;
+ }
+
+ /* Another panic already in progress */
+ if (panic_in_progress())
+ return false;
+
+ /*
+ * Only one CPU can do the redirect. Use atomic cmpxchg to ensure
+ * we don't race with another CPU also trying to redirect.
+ */
+ if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu))
+ return false;
+
+ /*
+ * Use the dynamically allocated buffer if available; otherwise fall
+ * back to a static message (early-boot panics or allocation failure).
+ */
+ if (panic_force_buf) {
+ vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args);
+ msg = panic_force_buf;
+ } else {
+ msg = "Redirected panic (buffer unavailable)";
+ }
+
+ console_verbose();
+ bust_spinlocks(1);
+
+ pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n",
+ this_cpu, panic_force_cpu);
+
+ /* Dump original CPU before redirecting */
+ if (!test_taint(TAINT_DIE) &&
+ oops_in_progress <= 1 &&
+ IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
+ dump_stack();
+ }
+
+ if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) {
+ atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID);
+ pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n",
+ panic_force_cpu, this_cpu);
+ return false;
+ }
+
+ /* IPI/NMI sent, this CPU should stop */
+ return true;
+}
+#else
+__printf(1, 0)
+static inline bool panic_try_force_cpu(const char *fmt, va_list args)
+{
+ return false;
+}
+#endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */
bool panic_try_start(void)
{
@@ -427,7 +575,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
*/
void vpanic(const char *fmt, va_list args)
{
- static char buf[1024];
+ static char buf[PANIC_MSG_BUFSZ];
long i, i_next = 0, len;
int state = 0;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
@@ -451,6 +599,15 @@ void vpanic(const char *fmt, va_list args)
local_irq_disable();
preempt_disable_notrace();
+ /* Redirect panic to target CPU if configured via panic_force_cpu=. */
+ if (panic_try_force_cpu(fmt, args)) {
+ /*
+ * Mark ourselves offline so panic_other_cpus_shutdown() won't wait
+ * for us on architectures that check num_online_cpus().
+ */
+ set_cpu_online(smp_processor_id(), false);
+ panic_smp_self_stop();
+ }
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
@@ -483,7 +640,11 @@ void vpanic(const char *fmt, va_list args)
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID &&
+ panic_force_cpu == raw_smp_processor_id()) {
+ pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n",
+ atomic_read(&panic_redirect_cpu));
+ } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
panic_this_cpu_backtrace_printed = true;
} else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
@@ -1014,7 +1175,6 @@ static int panic_print_set(const char *val, const struct kernel_param *kp)
static int panic_print_get(char *val, const struct kernel_param *kp)
{
- panic_print_deprecated();
return param_get_ulong(val, kp);
}
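
Taken together, the panic.c changes add a panic_force_cpu= boot parameter plus a __weak panic_smp_redirect_cpu() hook: booting with, say, panic_force_cpu=0 makes a panic raised on any other CPU re-issue itself on CPU 0 through the IPI-based default above. An architecture with an NMI-capable notification path could override the weak hook along these lines (sketch only; arch_send_panic_nmi() and panic_nmi_msg are hypothetical names, not provided by the patch):

        /* Hypothetical arch override of the __weak default shown above. */
        static void *panic_nmi_msg;

        int panic_smp_redirect_cpu(int target_cpu, void *msg)
        {
                WRITE_ONCE(panic_nmi_msg, msg);         /* consumed by the NMI handler */
                return arch_send_panic_nmi(target_cpu); /* 0 on success, -errno otherwise */
        }
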
diff --git a/kernel/params.c b/kernel/params.c
index b96cfd693c99..7c2242f64bf0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -596,12 +596,6 @@ static ssize_t param_attr_store(const struct module_attribute *mattr,
}
#endif
-#ifdef CONFIG_MODULES
-#define __modinit
-#else
-#define __modinit __init
-#endif
-
#ifdef CONFIG_SYSFS
void kernel_param_lock(struct module *mod)
{
@@ -626,9 +620,9 @@ EXPORT_SYMBOL(kernel_param_unlock);
* create file in sysfs. Returns an error on out of memory. Always cleans up
* if there's an error.
*/
-static __modinit int add_sysfs_param(struct module_kobject *mk,
- const struct kernel_param *kp,
- const char *name)
+static __init_or_module int add_sysfs_param(struct module_kobject *mk,
+ const struct kernel_param *kp,
+ const char *name)
{
struct module_param_attrs *new_mp;
struct attribute **new_attrs;
@@ -761,7 +755,8 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
+struct module_kobject * __init_or_module
+lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
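
The local __modinit macro is dropped in favor of the generic __init_or_module annotation, which conventionally expands to nothing when CONFIG_MODULES is enabled (the code must stay resident to create sysfs entries for modules loaded later) and to __init otherwise, so it can be discarded after boot. For reference, the conventional definition looks like this (paraphrased from <linux/init.h>):

        #ifdef CONFIG_MODULES
        #define __init_or_module
        #else
        #define __init_or_module __init
        #endif
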
diff --git a/kernel/pid.c b/kernel/pid.c
index a31771bc89c1..f45ae56db7da 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
free_pid(pids[tmp]);
}
-struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
- size_t set_tid_size)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
+ size_t arg_set_tid_size)
{
+ int set_tid[MAX_PID_NS_LEVEL + 1] = {};
+ int pid_max[MAX_PID_NS_LEVEL + 1] = {};
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
+ bool retried_preload;
/*
- * set_tid_size contains the size of the set_tid array. Starting at
+ * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
- * up to set_tid_size PID namespaces. It does not have to set the PID
- * for a process in all nested PID namespaces but set_tid_size must
+ * up to arg_set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but arg_set_tid_size must
* never be greater than the current ns->level + 1.
*/
- if (set_tid_size > ns->level + 1)
+ if (arg_set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
+ /*
+ * Prep before we take locks:
+ *
+ * 1. allocate and fill in pid struct
+ */
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
- tmp = ns;
+ get_pid_ns(ns);
pid->level = ns->level;
+ refcount_set(&pid->count, 1);
+ spin_lock_init(&pid->lock);
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+ INIT_HLIST_HEAD(&pid->inodes);
- for (i = ns->level; i >= 0; i--) {
- int tid = 0;
- int pid_max = READ_ONCE(tmp->pid_max);
+ /*
+ * 2. permission check via checkpoint_restore_ns_capable()
+ *
+ * This also records the pid_max value found at each level so that later
+ * code uses the same value.
+ */
+ for (tmp = ns, i = ns->level; i >= 0; i--) {
+ pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
- if (set_tid_size) {
- tid = set_tid[ns->level - i];
+ if (arg_set_tid_size) {
+ int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
retval = -EINVAL;
- if (tid < 1 || tid >= pid_max)
- goto out_free;
+ if (tid < 1 || tid >= pid_max[ns->level - i])
+ goto out_abort;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
- goto out_free;
+ goto out_abort;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
- goto out_free;
- set_tid_size--;
+ goto out_abort;
+ arg_set_tid_size--;
}
- idr_preload(GFP_KERNEL);
- spin_lock(&pidmap_lock);
+ tmp = tmp->parent;
+ }
+
+ /*
+ * Prep is done, id allocation goes here:
+ */
+ retried_preload = false;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+ for (tmp = ns, i = ns->level; i >= 0;) {
+ int tid = set_tid[ns->level - i];
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
+
nr = -EEXIST;
} else {
int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ pid_max[ns->level - i], GFP_ATOMIC);
+ if (nr == -ENOSPC)
+ nr = -EAGAIN;
}
- spin_unlock(&pidmap_lock);
- idr_preload_end();
- if (nr < 0) {
- retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+ if (unlikely(nr < 0)) {
+ /*
+ * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
+ *
+ * The IDR API only allows us to preload memory for one call, while we may end
+ * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
+ * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
+ * did not help (the routine unfortunately returns void, so we have no idea
+ * if it got anywhere).
+ *
+ * The lock can be safely dropped and picked up as historically pid allocation
+ * for different namespaces was *not* atomic -- we try to hold on to it the
+ * entire time only for performance reasons.
+ */
+ if (nr == -ENOMEM && !retried_preload) {
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
+ retried_preload = true;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+ continue;
+ }
+ retval = nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
+ i--;
+ retried_preload = false;
}
/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* is what we have exposed to userspace for a long time and it is
* documented behavior for pid namespaces. So we can't easily
* change it even if there were an error code better suited.
+ *
+ * This can't be done earlier because we need to preserve other
+ * error conditions.
*/
retval = -ENOMEM;
-
- get_pid_ns(ns);
- refcount_set(&pid->count, 1);
- spin_lock_init(&pid->lock);
- for (type = 0; type < PIDTYPE_MAX; ++type)
- INIT_HLIST_HEAD(&pid->tasks[type]);
-
- init_waitqueue_head(&pid->wait_pidfd);
- INIT_HLIST_HEAD(&pid->inodes);
-
- upid = pid->numbers + ns->level;
- idr_preload(GFP_KERNEL);
- spin_lock(&pidmap_lock);
- if (!(ns->pid_allocated & PIDNS_ADDING))
- goto out_unlock;
+ if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
+ goto out_free;
pidfs_add_pid(pid);
- for ( ; upid >= pid->numbers; --upid) {
+ for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
return pid;
-out_unlock:
- spin_unlock(&pidmap_lock);
- idr_preload_end();
- put_pid_ns(ns);
-
out_free:
- spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ out_free:
idr_set_cursor(&ns->idr, 0);
spin_unlock(&pidmap_lock);
+ idr_preload_end();
+out_abort:
+ put_pid_ns(ns);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}
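
The reworked alloc_pid() now holds pidmap_lock across all namespace levels and, when idr_alloc() fails with -ENOMEM under GFP_ATOMIC, drops the lock once to refill the preload cache with GFP_KERNEL before retrying the same level; retried_preload caps this at one retry per level because idr_preload() gives no feedback on whether it obtained memory. The pattern, distilled into a standalone sketch (alloc_ids() is illustrative and omits ID cleanup on error):

        /* Illustrative helper: allocate one ID per object, retrying the GFP_ATOMIC
         * allocation once per slot with a fresh GFP_KERNEL preload on -ENOMEM. */
        static int alloc_ids(struct idr *idr, spinlock_t *lock, void **objs, int *ids, int n)
        {
                bool retried = false;
                int i = 0;

                idr_preload(GFP_KERNEL);
                spin_lock(lock);
                while (i < n) {
                        int nr = idr_alloc(idr, objs[i], 1, 0, GFP_ATOMIC);

                        if (nr == -ENOMEM && !retried) {
                                /* Drop the lock, top up the preload cache, retry once. */
                                spin_unlock(lock);
                                idr_preload_end();
                                retried = true;
                                idr_preload(GFP_KERNEL);
                                spin_lock(lock);
                                continue;
                        }
                        if (nr < 0) {
                                spin_unlock(lock);
                                idr_preload_end();
                                return nr;
                        }
                        ids[i++] = nr;
                        retried = false;
                }
                spin_unlock(lock);
                idr_preload_end();
                return 0;
        }
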
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 03b2c5495c77..5f8c9e12eaec 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1125,7 +1125,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
static int __init pm_start_workqueues(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0);
+ pm_wq = alloc_workqueue("pm", WQ_UNBOUND, 0);
if (!pm_wq)
return -ENOMEM;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8050e5182835..c4eb284b8e72 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap)
* Allocate a swap page and register that it has been allocated, so that
* it can be freed in case of an error.
*/
- offset = swp_offset(get_swap_page_of_type(swap));
+ offset = swp_offset(swap_alloc_hibernation_slot(swap));
if (offset) {
if (swsusp_extents_insert(offset))
- swap_free(swp_entry(swap, offset));
+ swap_free_hibernation_slot(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
}
@@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap)
void free_all_swap_pages(int swap)
{
+ unsigned long offset;
struct rb_node *node;
/*
@@ -197,8 +198,9 @@ void free_all_swap_pages(int swap)
ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
- swap_free_nr(swp_entry(swap, ext->start),
- ext->end - ext->start + 1);
+
+ for (offset = ext->start; offset <= ext->end; offset++)
+ swap_free_hibernation_slot(swp_entry(swap, offset));
kfree(ext);
}
@@ -902,8 +904,8 @@ out_clean:
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
- if (data[thr].cr)
- acomp_request_free(data[thr].cr);
+
+ acomp_request_free(data[thr].cr);
if (!IS_ERR_OR_NULL(data[thr].cc))
crypto_free_acomp(data[thr].cc);
@@ -1502,8 +1504,8 @@ out_clean:
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
- if (data[thr].cr)
- acomp_request_free(data[thr].cr);
+
+ acomp_request_free(data[thr].cr);
if (!IS_ERR_OR_NULL(data[thr].cc))
crypto_free_acomp(data[thr].cc);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5f5f626f4279..5fdea5682756 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -281,12 +281,20 @@ struct printk_buffers {
* nothing to output and this record should be skipped.
* @seq: The sequence number of the record used for @pbufs->outbuf.
* @dropped: The number of dropped records from reading @seq.
+ * @cpu: CPU on which the message was generated.
+ * @pid: PID of the task that generated the message.
+ * @comm: Name of the task that generated the message.
*/
struct printk_message {
struct printk_buffers *pbufs;
unsigned int outbuf_len;
u64 seq;
unsigned long dropped;
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+ int cpu;
+ pid_t pid;
+ char comm[TASK_COMM_LEN];
+#endif
};
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 32fc12e53675..d558b18505cd 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -946,6 +946,20 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
}
EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt,
+ struct printk_message *pmsg)
+{
+ wctxt->cpu = pmsg->cpu;
+ wctxt->pid = pmsg->pid;
+ memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm));
+ static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm));
+}
+#else
+static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt,
+ struct printk_message *pmsg) {}
+#endif
+
/**
* nbcon_emit_next_record - Emit a record in the acquired context
* @wctxt: The write context that will be handed to the write function
@@ -1048,6 +1062,8 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_a
/* Initialize the write context for driver callbacks. */
nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
+ wctxt_load_execution_ctx(wctxt, &pmsg);
+
if (use_atomic)
con->write_atomic(con, wctxt);
else
@@ -1758,9 +1774,12 @@ bool nbcon_alloc(struct console *con)
/* Synchronize the kthread start. */
lockdep_assert_console_list_lock_held();
- /* The write_thread() callback is mandatory. */
- if (WARN_ON(!con->write_thread))
+ /* Check for mandatory nbcon callbacks. */
+ if (WARN_ON(!con->write_thread ||
+ !con->device_lock ||
+ !con->device_unlock)) {
return false;
+ }
rcuwait_init(&con->rcuwait);
init_irq_work(&con->irq_work, nbcon_irq_work);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1d765ad242b8..4ddeb55e1207 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -245,6 +245,7 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
* For console list or console->flags updates
*/
void console_list_lock(void)
+ __acquires(&console_mutex)
{
/*
* In unregister_console() and console_force_preferred_locked(),
@@ -269,6 +270,7 @@ EXPORT_SYMBOL(console_list_lock);
* Counterpart to console_list_lock()
*/
void console_list_unlock(void)
+ __releases(&console_mutex)
{
mutex_unlock(&console_mutex);
}
@@ -2131,11 +2133,39 @@ static inline void printk_delay(int level)
}
}
+#define CALLER_ID_MASK 0x80000000
+
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + smp_processor_id();
+ CALLER_ID_MASK + smp_processor_id();
+}
+
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+/* Store the complementary information to caller_id. */
+static u32 printk_caller_id2(void)
+{
+ return !in_task() ? task_pid_nr(current) :
+ CALLER_ID_MASK + smp_processor_id();
+}
+
+static pid_t printk_info_get_pid(const struct printk_info *info)
+{
+ u32 caller_id = info->caller_id;
+ u32 caller_id2 = info->caller_id2;
+
+ return caller_id & CALLER_ID_MASK ? caller_id2 : caller_id;
+}
+
+static int printk_info_get_cpu(const struct printk_info *info)
+{
+ u32 caller_id = info->caller_id;
+ u32 caller_id2 = info->caller_id2;
+
+ return ((caller_id & CALLER_ID_MASK ?
+ caller_id : caller_id2) & ~CALLER_ID_MASK);
}
+#endif
/**
* printk_parse_prefix - Parse level and control flags.
@@ -2213,6 +2243,28 @@ static u16 printk_sprint(char *text, u16 size, int facility,
return text_len;
}
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+static void printk_store_execution_ctx(struct printk_info *info)
+{
+ info->caller_id2 = printk_caller_id2();
+ get_task_comm(info->comm, current);
+}
+
+static void pmsg_load_execution_ctx(struct printk_message *pmsg,
+ const struct printk_info *info)
+{
+ pmsg->cpu = printk_info_get_cpu(info);
+ pmsg->pid = printk_info_get_pid(info);
+ memcpy(pmsg->comm, info->comm, sizeof(pmsg->comm));
+ static_assert(sizeof(pmsg->comm) == sizeof(info->comm));
+}
+#else
+static void printk_store_execution_ctx(struct printk_info *info) {}
+
+static void pmsg_load_execution_ctx(struct printk_message *pmsg,
+ const struct printk_info *info) {}
+#endif
+
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -2320,6 +2372,7 @@ int vprintk_store(int facility, int level,
r.info->caller_id = caller_id;
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+ printk_store_execution_ctx(r.info);
/* A message without a trailing newline can be continued. */
if (!(flags & LOG_NEWLINE))
@@ -3002,6 +3055,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
pmsg->seq = r.info->seq;
pmsg->dropped = r.info->seq - seq;
force_con = r.info->flags & LOG_FORCE_CON;
+ pmsg_load_execution_ctx(pmsg, r.info);
/*
* Skip records that are not forced to be printed on consoles and that
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 4ef81349d9fb..1651b53ece34 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -23,6 +23,11 @@ struct printk_info {
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
u32 caller_id; /* thread id or processor id */
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+ u32 caller_id2; /* caller_id complement */
+ /* name of the task that generated the message */
+ char comm[TASK_COMM_LEN];
+#endif
struct dev_printk_info dev_info;
};
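
With CONFIG_PRINTK_EXECUTION_CTX, each record now carries two 32-bit caller fields: caller_id keeps the existing encoding (PID in task context, CPU with the 0x80000000 flag otherwise) and caller_id2 stores the complementary value, so both PID and CPU can always be recovered. Worked example with made-up numbers: a message printed from IRQ context on CPU 3 while PID 1234 was interrupted stores caller_id = 0x80000003 and caller_id2 = 1234; printk_info_get_cpu() strips the flag from whichever field carries it and returns 3, while printk_info_get_pid() returns the unflagged field, 1234. From task context the two fields simply swap roles.
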
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 4d9b21f69eaa..762299291e09 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -82,7 +82,7 @@ config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
config TASKS_RCU_GENERIC
- def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
+ def_bool TASKS_RCU || TASKS_RUDE_RCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
@@ -142,6 +142,29 @@ config TASKS_TRACE_RCU
default n
select IRQ_WORK
+config TASKS_TRACE_RCU_NO_MB
+ bool "Override RCU Tasks Trace inclusion of read-side memory barriers"
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
+ default ARCH_WANTS_NO_INSTR
+ help
+ This option prevents the use of read-side memory barriers in
+ rcu_read_lock_tasks_trace() and rcu_read_unlock_tasks_trace()
+ even in kernels built with CONFIG_ARCH_WANTS_NO_INSTR=n, that is,
+ in kernels that do not have noinstr set up in entry/exit code.
+ By setting this option, you are promising to carefully review
+ use of ftrace, BPF, and friends to ensure that no tracing
+ operation is attached to a function that runs in that portion
+ of the entry/exit code that RCU does not watch, that is,
+ where rcu_is_watching() returns false. Alternatively, you
+ might choose to never remove traces except by rebooting.
+
+ Those wishing to disable read-side memory barriers for an entire
+ architecture can select this Kconfig option, hence the polarity.
+
+ Say Y here if you need speed and will review use of tracing.
+ Say N here for certain esoteric testing of RCU itself.
+ Take the default if you are unsure.
+
config RCU_STALL_COMMON
def_bool TREE_RCU
help
@@ -313,24 +336,6 @@ config RCU_NOCB_CPU_CB_BOOST
Say Y here if you want to set RT priority for offloading kthreads.
Say N here if you are building a !PREEMPT_RT kernel and are unsure.
-config TASKS_TRACE_RCU_READ_MB
- bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT && TASKS_TRACE_RCU
- default PREEMPT_RT || NR_CPUS < 8
- help
- Use this option to further reduce the number of IPIs sent
- to CPUs executing in userspace or idle during tasks trace
- RCU grace periods. Given that a reasonable setting of
- the rcupdate.rcu_task_ipi_delay kernel boot parameter
- eliminates such IPIs for many workloads, proper setting
- of this Kconfig option is important mostly for aggressive
- real-time installations and for battery-powered devices,
- hence the default chosen above.
-
- Say Y here if you hate IPIs.
- Say N here if you hate read-side memory barriers.
- Take the default if you are unsure.
-
config RCU_LAZY
bool "RCU callback lazy invocation functionality"
depends on RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 9cf01832a6c3..dc5d614b372c 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
-#ifdef CONFIG_TASKS_TRACE_RCU
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
-#endif
-
#ifdef CONFIG_TASKS_RCU_GENERIC
void tasks_cblist_init_generic(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void);
#else
static inline void show_rcu_tasks_rude_gp_kthread(void) {}
#endif
-#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
-void show_rcu_tasks_trace_gp_kthread(void);
-#else
-static inline void show_rcu_tasks_trace_gp_kthread(void) {}
-#endif
#ifdef CONFIG_TINY_RCU
static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 7484d8ad5767..1c50f89fbd6f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx)
rcu_read_unlock_trace();
}
-static void rcu_tasks_trace_scale_stats(void)
-{
- rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
-}
-
static struct rcu_scale_ops tasks_tracing_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
- .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
- .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
.name = "tasks-tracing"
};
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 07e51974b06b..47ce7f49b52c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1178,10 +1178,9 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .exp_current = rcu_tasks_trace_expedite_current,
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
- .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
- .get_gp_data = rcu_tasks_trace_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.slow_gps = 1,
@@ -1750,7 +1749,7 @@ rcu_torture_writer(void *arg)
ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
- if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
cur_ops->exp_current();
while (!cur_ops->poll_gp_state(gp_snap)) {
gp_snap1 = cur_ops->get_gp_state();
@@ -1772,7 +1771,7 @@ rcu_torture_writer(void *arg)
cur_ops->get_comp_state_full(&rgo[i]);
cur_ops->start_gp_poll_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
- if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
cur_ops->exp_current();
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
cur_ops->get_gp_state_full(&gp_snap1_full);
@@ -2455,6 +2454,9 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
*/
static void rcu_torture_timer(struct timer_list *unused)
{
+ WARN_ON_ONCE(!in_serving_softirq());
+ WARN_ON_ONCE(in_hardirq());
+ WARN_ON_ONCE(in_nmi());
atomic_long_inc(&n_rcu_torture_timers);
(void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
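
The rcu_torture_writer() fix above is an operator-precedence repair: !torture_random(&rand) % 0xff parses as (!torture_random(&rand)) % 0xff, which is nonzero only when torture_random() returns exactly 0, so exp_current() was effectively never exercised. The corrected !(torture_random(&rand) & 0xff) is true whenever the low byte is zero, i.e. roughly once every 256 polling passes.
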
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ea3f128de06f..c469c708fdd6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -262,7 +262,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
- if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL))
goto err_free_sda;
WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 2dc044fd126e..76f952196a29 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
#endif
-/* Avoid IPIing CPUs early in the grace period. */
-#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
-static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
-module_param(rcu_task_ipi_delay, int, 0644);
-
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -718,7 +713,6 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -801,9 +795,7 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t
#endif // #ifndef CONFIG_TINY_RCU
-static void exit_tasks_rcu_finish_trace(struct task_struct *t);
-
-#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+#if defined(CONFIG_TASKS_RCU)
////////////////////////////////////////////////////////////////////////
//
@@ -898,7 +890,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
rtp->postgp_func(rtp);
}
-#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */
+#endif /* #if defined(CONFIG_TASKS_RCU) */
#ifdef CONFIG_TASKS_RCU
@@ -1322,13 +1314,11 @@ void exit_tasks_rcu_finish(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-
- exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
+void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RUDE_RCU
@@ -1449,682 +1439,11 @@ EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
-////////////////////////////////////////////////////////////////////////
-//
-// Tracing variant of Tasks RCU. This variant is designed to be used
-// to protect tracing hooks, including those of BPF. This variant
-// therefore:
-//
-// 1. Has explicit read-side markers to allow finite grace periods
-// in the face of in-kernel loops for PREEMPT=n builds.
-//
-// 2. Protects code in the idle loop, exception entry/exit, and
-// CPU-hotplug code paths, similar to the capabilities of SRCU.
-//
-// 3. Avoids expensive read-side instructions, having overhead similar
-// to that of Preemptible RCU.
-//
-// There are of course downsides. For example, the grace-period code
-// can send IPIs to CPUs, even when those CPUs are in the idle loop or
-// in nohz_full userspace. If needed, these downsides can be at least
-// partially remedied.
-//
-// Perhaps most important, this variant of RCU does not affect the vanilla
-// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
-// readers can operate from idle, offline, and exception entry/exit in no
-// way allows rcu_preempt and rcu_sched readers to also do so.
-//
-// The implementation uses rcu_tasks_wait_gp(), which relies on function
-// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread()
-// function sets these function pointers up so that rcu_tasks_wait_gp()
-// invokes these functions in this order:
-//
-// rcu_tasks_trace_pregp_step():
-// Disables CPU hotplug, adds all currently executing tasks to the
-// holdout list, then checks the state of all tasks that blocked
-// or were preempted within their current RCU Tasks Trace read-side
-// critical section, adding them to the holdout list if appropriate.
-// Finally, this function re-enables CPU hotplug.
-// The ->pertask_func() pointer is NULL, so there is no per-task processing.
-// rcu_tasks_trace_postscan():
-// Invokes synchronize_rcu() to wait for late-stage exiting tasks
-// to finish exiting.
-// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
-// Scans the holdout list, attempting to identify a quiescent state
-// for each task on the list. If there is a quiescent state, the
-// corresponding task is removed from the holdout list. Once this
-// list is empty, the grace period has completed.
-// rcu_tasks_trace_postgp():
-// Provides the needed full memory barrier and does debug checks.
-//
-// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
-//
-// Pre-grace-period update-side code is ordered before the grace period
-// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
-// read-side code is ordered before the grace period by atomic operations
-// on .b.need_qs flag of each task involved in this process, or by scheduler
-// context-switch ordering (for locked-down non-running readers).
-
-// The lockdep state must be outside of #ifdef to be useful.
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_trace_key;
-struct lockdep_map rcu_trace_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key);
-EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_TASKS_TRACE_RCU
-
-// Record outstanding IPIs to each CPU. No point in sending two...
-static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
-
-// The number of detections of task quiescent state relying on
-// heavyweight readers executing explicit memory barriers.
-static unsigned long n_heavy_reader_attempts;
-static unsigned long n_heavy_reader_updates;
-static unsigned long n_heavy_reader_ofl_updates;
-static unsigned long n_trc_holdouts;
-
-void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
- "RCU Tasks Trace");
-
-/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
-static u8 rcu_ld_need_qs(struct task_struct *t)
-{
- smp_mb(); // Enforce full grace-period ordering.
- return smp_load_acquire(&t->trc_reader_special.b.need_qs);
-}
-
-/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
-static void rcu_st_need_qs(struct task_struct *t, u8 v)
-{
- smp_store_release(&t->trc_reader_special.b.need_qs, v);
- smp_mb(); // Enforce full grace-period ordering.
-}
-
-/*
- * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
- * the four-byte operand-size restriction of some platforms.
- *
- * Returns the old value, which is often ignored.
- */
-u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
-{
- return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
-}
-EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-
-/*
- * If we are the last reader, signal the grace-period kthread.
- * Also remove from the per-CPU list of blocked tasks.
- */
-void rcu_read_unlock_trace_special(struct task_struct *t)
-{
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
- union rcu_special trs;
-
- // Open-coded full-word version of rcu_ld_need_qs().
- smp_mb(); // Enforce full grace-period ordering.
- trs = smp_load_acquire(&t->trc_reader_special);
-
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
- smp_mb(); // Pairs with update-side barriers.
- // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
- u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
- TRC_NEED_QS_CHECKED);
-
- WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
- }
- if (trs.b.blocked) {
- rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- list_del_init(&t->trc_blkd_node);
- WRITE_ONCE(t->trc_reader_special.b.blocked, false);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- }
- WRITE_ONCE(t->trc_reader_nesting, 0);
-}
-EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
-
-/* Add a newly blocked reader task to its CPU's list. */
-void rcu_tasks_trace_qs_blkd(struct task_struct *t)
-{
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
-
- local_irq_save(flags);
- rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
- t->trc_blkd_cpu = smp_processor_id();
- if (!rtpcp->rtp_blkd_tasks.next)
- INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
- list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
- WRITE_ONCE(t->trc_reader_special.b.blocked, true);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
-
-/* Add a task to the holdout list, if it is not already on the list. */
-static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
-{
- if (list_empty(&t->trc_holdout_list)) {
- get_task_struct(t);
- list_add(&t->trc_holdout_list, bhp);
- n_trc_holdouts++;
- }
-}
-
-/* Remove a task from the holdout list, if it is in fact present. */
-static void trc_del_holdout(struct task_struct *t)
-{
- if (!list_empty(&t->trc_holdout_list)) {
- list_del_init(&t->trc_holdout_list);
- put_task_struct(t);
- n_trc_holdouts--;
- }
-}
-
-/* IPI handler to check task state. */
-static void trc_read_check_handler(void *t_in)
-{
- int nesting;
- struct task_struct *t = current;
- struct task_struct *texp = t_in;
-
- // If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t))
- goto reset_ipi; // Already on holdout list, so will check later.
-
- // If the task is not in a read-side critical section, and
- // if this is the last reader, awaken the grace-period kthread.
- nesting = READ_ONCE(t->trc_reader_nesting);
- if (likely(!nesting)) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- goto reset_ipi;
- }
- // If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(nesting < 0))
- goto reset_ipi;
-
- // Get here if the task is in a read-side critical section.
- // Set its state so that it will update state for the grace-period
- // kthread upon exit from that critical section.
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
-
-reset_ipi:
- // Allow future IPIs to be sent on CPU and for task.
- // Also order this IPI handler against any later manipulations of
- // the intended task.
- smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
- smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
-}
-
-/* Callback function for scheduler to check locked-down task. */
-static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
-{
- struct list_head *bhp = bhp_in;
- int cpu = task_cpu(t);
- int nesting;
- bool ofl = cpu_is_offline(cpu);
-
- if (task_curr(t) && !ofl) {
- // If no chance of heavyweight readers, do it the hard way.
- if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return -EINVAL;
-
- // If heavyweight readers are enabled on the remote task,
- // we can inspect its state despite its currently running.
- // However, we cannot safely change its state.
- n_heavy_reader_attempts++;
- // Check for "running" idle tasks on offline CPUs.
- if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return -EINVAL; // No quiescent state, do it the hard way.
- n_heavy_reader_updates++;
- nesting = 0;
- } else {
- // The task is not running, so C-language access is safe.
- nesting = t->trc_reader_nesting;
- WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
- n_heavy_reader_ofl_updates++;
- }
-
- // If not exiting a read-side critical section, mark as checked
- // so that the grace-period kthread will remove it from the
- // holdout list.
- if (!nesting) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- return 0; // In QS, so done.
- }
- if (nesting < 0)
- return -EINVAL; // Reader transitioning, try again later.
-
- // The task is in a read-side critical section, so set up its
- // state so that it will update state upon exit from that critical
- // section.
- if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
- trc_add_holdout(t, bhp);
- return 0;
-}
-
-/* Attempt to extract the state for the specified task. */
-static void trc_wait_for_one_reader(struct task_struct *t,
- struct list_head *bhp)
-{
- int cpu;
-
- // If a previous IPI is still in flight, let it complete.
- if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI
- return;
-
- // The current task had better be in a quiescent state.
- if (t == current) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
- return;
- }
-
- // Attempt to nail down the task for inspection.
- get_task_struct(t);
- if (!task_call_func(t, trc_inspect_reader, bhp)) {
- put_task_struct(t);
- return;
- }
- put_task_struct(t);
-
- // If this task is not yet on the holdout list, then we are in
- // an RCU read-side critical section. Otherwise, the invocation of
- // trc_add_holdout() that added it to the list did the necessary
- // get_task_struct(). Either way, the task cannot be freed out
- // from under this code.
-
- // If currently running, send an IPI, either way, add to list.
- trc_add_holdout(t, bhp);
- if (task_curr(t) &&
- time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
- // The task is currently running, so try IPIing it.
- cpu = task_cpu(t);
-
- // If there is already an IPI outstanding, let it happen.
- if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
- return;
-
- per_cpu(trc_ipi_to_cpu, cpu) = true;
- t->trc_ipi_to_cpu = cpu;
- rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
- // Just in case there is some other reason for
- // failure than the target CPU being offline.
- WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
- __func__, cpu);
- rcu_tasks_trace.n_ipis_fails++;
- per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = -1;
- }
- }
-}
-
-/*
- * Initialize for first-round processing for the specified task.
- * Return false if task is NULL or already taken care of, true otherwise.
- */
-static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
-{
- // During early boot when there is only the one boot CPU, there
- // is no idle task for the other CPUs. Also, the grace-period
- // kthread is always in a quiescent state. In addition, just return
- // if this task is already on the list.
- if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
- return false;
-
- rcu_st_need_qs(t, 0);
- t->trc_ipi_to_cpu = -1;
- return true;
-}
-
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
-{
- if (rcu_tasks_trace_pertask_prep(t, true))
- trc_wait_for_one_reader(t, hop);
-}
-
-/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(struct list_head *hop)
-{
- LIST_HEAD(blkd_tasks);
- int cpu;
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
- struct task_struct *t;
-
- // There shouldn't be any old IPIs, but...
- for_each_possible_cpu(cpu)
- WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
-
- // Disable CPU hotplug across the CPU scan for the benefit of
- // any IPIs that might be needed. This also waits for all readers
- // in CPU-hotplug code paths.
- cpus_read_lock();
-
- // These rcu_tasks_trace_pertask_prep() calls are serialized to
- // allow safe access to the hop list.
- for_each_online_cpu(cpu) {
- rcu_read_lock();
- // Note that cpu_curr_snapshot() picks up the target
- // CPU's current task while its runqueue is locked with
- // an smp_mb__after_spinlock(). This ensures that either
- // the grace-period kthread will see that task's read-side
- // critical section or the task will see the updater's pre-GP
- // accesses. The trailing smp_mb() in cpu_curr_snapshot()
- // does not currently play a role other than simplify
- // that function's ordering semantics. If these simplified
- // ordering semantics continue to be redundant, that smp_mb()
- // might be removed.
- t = cpu_curr_snapshot(cpu);
- if (rcu_tasks_trace_pertask_prep(t, true))
- trc_add_holdout(t, hop);
- rcu_read_unlock();
- cond_resched_tasks_rcu_qs();
- }
-
- // Only after all running tasks have been accounted for is it
- // safe to take care of the tasks that have blocked within their
- // current RCU tasks trace read-side critical section.
- for_each_possible_cpu(cpu) {
- rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
- while (!list_empty(&blkd_tasks)) {
- rcu_read_lock();
- t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
- list_del_init(&t->trc_blkd_node);
- list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- rcu_tasks_trace_pertask(t, hop);
- rcu_read_unlock();
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- }
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- cond_resched_tasks_rcu_qs();
- }
-
- // Re-enable CPU hotplug now that the holdout list is populated.
- cpus_read_unlock();
-}
-
-/*
- * Do intermediate processing between task and holdout scans.
- */
-static void rcu_tasks_trace_postscan(struct list_head *hop)
-{
- // Wait for late-stage exiting tasks to finish exiting.
- // These might have passed the call to exit_tasks_rcu_finish().
-
- // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
- synchronize_rcu();
- // Any tasks that exit after this point will set
- // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
-}
-
-/* Communicate task state back to the RCU tasks trace stall warning request. */
-struct trc_stall_chk_rdr {
- int nesting;
- int ipi_to_cpu;
- u8 needqs;
-};
-
-static int trc_check_slow_task(struct task_struct *t, void *arg)
-{
- struct trc_stall_chk_rdr *trc_rdrp = arg;
-
- if (task_curr(t) && cpu_online(task_cpu(t)))
- return false; // It is running, so decline to inspect it.
- trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
- trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
- trc_rdrp->needqs = rcu_ld_need_qs(t);
- return true;
-}
-
-/* Show the state of a task stalling the current RCU tasks trace GP. */
-static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
-{
- int cpu;
- struct trc_stall_chk_rdr trc_rdr;
- bool is_idle_tsk = is_idle_task(t);
-
- if (*firstreport) {
- pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
- *firstreport = false;
- }
- cpu = task_cpu(t);
- if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
- pr_alert("P%d: %c%c\n",
- t->pid,
- ".I"[t->trc_ipi_to_cpu >= 0],
- ".i"[is_idle_tsk]);
- else
- pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
- t->pid,
- ".I"[trc_rdr.ipi_to_cpu >= 0],
- ".i"[is_idle_tsk],
- ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
- ".B"[!!data_race(t->trc_reader_special.b.blocked)],
- trc_rdr.nesting,
- " !CN"[trc_rdr.needqs & 0x3],
- " ?"[trc_rdr.needqs > 0x3],
- cpu, cpu_online(cpu) ? "" : "(offline)");
- sched_show_task(t);
-}
-
-/* List stalled IPIs for RCU tasks trace. */
-static void show_stalled_ipi_trace(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- if (per_cpu(trc_ipi_to_cpu, cpu))
- pr_alert("\tIPI outstanding to CPU %d\n", cpu);
-}
-
-/* Do one scan of the holdout list. */
-static void check_all_holdout_tasks_trace(struct list_head *hop,
- bool needreport, bool *firstreport)
-{
- struct task_struct *g, *t;
-
- // Disable CPU hotplug across the holdout list scan for IPIs.
- cpus_read_lock();
-
- list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
- // If safe and needed, try to check the current task.
- if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
- trc_wait_for_one_reader(t, hop);
-
- // If check succeeded, remove this task from the list.
- if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
- rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
- trc_del_holdout(t);
- else if (needreport)
- show_stalled_task_trace(t, firstreport);
- cond_resched_tasks_rcu_qs();
- }
-
- // Re-enable CPU hotplug now that the holdout list scan has completed.
- cpus_read_unlock();
-
- if (needreport) {
- if (*firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
- show_stalled_ipi_trace();
- }
-}
-
-static void rcu_tasks_trace_empty_fn(void *unused)
-{
-}
-
-/* Wait for grace period to complete and provide ordering. */
-static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
-{
- int cpu;
-
- // Wait for any lingering IPI handlers to complete. Note that
- // if a CPU has gone offline or transitioned to userspace in the
- // meantime, all IPI handlers should have been drained beforehand.
- // Yes, this assumes that CPUs process IPIs in order. If that ever
- // changes, there will need to be a recheck and/or timed wait.
- for_each_online_cpu(cpu)
- if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
- smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
-
- smp_mb(); // Caller's code must be ordered after wakeup.
- // Pairs with pretty much every ordering primitive.
-}
-
-/* Report any needed quiescent state for this exiting task. */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t)
-{
- union rcu_special trs = READ_ONCE(t->trc_reader_special);
-
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
- if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
- rcu_read_unlock_trace_special(t);
- else
- WRITE_ONCE(t->trc_reader_nesting, 0);
-}
-
-/**
- * call_rcu_tasks_trace() - Queue a callback trace task-based grace period
- * @rhp: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a trace rcu-tasks
- * grace period elapses, in other words after all currently executing
- * trace rcu-tasks read-side critical sections have completed. These
- * read-side critical sections are delimited by calls to rcu_read_lock_trace()
- * and rcu_read_unlock_trace().
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
-{
- call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
-
-/**
- * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
- *
- * Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently executing
- * trace rcu-tasks read-side critical sections have elapsed. These read-side
- * critical sections are delimited by calls to rcu_read_lock_trace()
- * and rcu_read_unlock_trace().
- *
- * This is a very specialized primitive, intended only for a few uses in
- * tracing and other situations requiring manipulation of function preambles
- * and profiling hooks. The synchronize_rcu_tasks_trace() function is not
- * (yet) intended for heavy use from multiple CPUs.
- *
- * See the description of synchronize_rcu() for more detailed information
- * on memory ordering guarantees.
- */
-void synchronize_rcu_tasks_trace(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section");
- synchronize_rcu_tasks_generic(&rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
-
-/**
- * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks.
- *
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
- */
-void rcu_barrier_tasks_trace(void)
-{
- rcu_barrier_tasks_generic(&rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
-
-int rcu_tasks_trace_lazy_ms = -1;
-module_param(rcu_tasks_trace_lazy_ms, int, 0444);
-
-static int __init rcu_spawn_tasks_trace_kthread(void)
-{
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
- rcu_tasks_trace.gp_sleep = HZ / 10;
- rcu_tasks_trace.init_fract = HZ / 10;
- } else {
- rcu_tasks_trace.gp_sleep = HZ / 200;
- if (rcu_tasks_trace.gp_sleep <= 0)
- rcu_tasks_trace.gp_sleep = 1;
- rcu_tasks_trace.init_fract = HZ / 200;
- if (rcu_tasks_trace.init_fract <= 0)
- rcu_tasks_trace.init_fract = 1;
- }
- if (rcu_tasks_trace_lazy_ms >= 0)
- rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
- rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
- rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
- rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
- rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace);
- return 0;
-}
-
-#if !defined(CONFIG_TINY_RCU)
-void show_rcu_tasks_trace_gp_kthread(void)
-{
- char buf[64];
-
- snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
- data_race(n_trc_holdouts),
- data_race(n_heavy_reader_ofl_updates),
- data_race(n_heavy_reader_updates),
- data_race(n_heavy_reader_attempts));
- show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
-}
-EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
-{
- rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
-#endif // !defined(CONFIG_TINY_RCU)
-
-struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
-{
- return rcu_tasks_trace.kthread_ptr;
-}
-EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
-{
- *flags = 0;
- *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
-
-#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
-#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
-
#ifndef CONFIG_TINY_RCU
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
- show_rcu_tasks_trace_gp_kthread();
}
#endif /* #ifndef CONFIG_TINY_RCU */
@@ -2251,10 +1570,6 @@ void __init tasks_cblist_init_generic(void)
#ifdef CONFIG_TASKS_RUDE_RCU
cblist_init_generic(&rcu_tasks_rude);
#endif
-
-#ifdef CONFIG_TASKS_TRACE_RCU
- cblist_init_generic(&rcu_tasks_trace);
-#endif
}
static int __init rcu_init_tasks_generic(void)
@@ -2267,10 +1582,6 @@ static int __init rcu_init_tasks_generic(void)
rcu_spawn_tasks_rude_kthread();
#endif
-#ifdef CONFIG_TASKS_TRACE_RCU
- rcu_spawn_tasks_trace_kthread();
-#endif
-
// Run the self-tests.
rcu_tasks_initiate_self_tests();
@@ -2281,3 +1592,16 @@ core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+////////////////////////////////////////////////////////////////////////
+//
+// Tracing variant of Tasks RCU. This variant is designed to protect
+// tracing hooks, including those of BPF, and is implemented via a
+// straightforward mapping onto SRCU-fast.
+
+DEFINE_SRCU_FAST(rcu_tasks_trace_srcu_struct);
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_srcu_struct);
+
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
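For illustration only, the update-side primitives removed above can be expressed as thin wrappers around the plain SRCU API once this srcu_struct exists; a minimal sketch follows (not part of the patch, the wrapper names are hypothetical):

/*
 * Illustrative sketch, not part of this patch: how the removed update-side
 * primitives could map onto SRCU calls against the new srcu_struct.
 */
static inline void call_rcu_tasks_trace_sketch(struct rcu_head *rhp,
					       rcu_callback_t func)
{
	call_srcu(&rcu_tasks_trace_srcu_struct, rhp, func);
}

static inline void synchronize_rcu_tasks_trace_sketch(void)
{
	synchronize_srcu(&rcu_tasks_trace_srcu_struct);
}

static inline void rcu_barrier_tasks_trace_sketch(void)
{
	srcu_barrier(&rcu_tasks_trace_srcu_struct);
}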
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..55df6d37145e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,6 +160,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
+static void rcu_report_qs_rdp(struct rcu_data *rdp);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -1983,6 +1984,17 @@ static noinline_for_stack bool rcu_gp_init(void)
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
+ /*
+ * Immediately report QS for the GP kthread's CPU. The GP kthread
+ * cannot be in an RCU read-side critical section while running
+ * the FQS scan. This eliminates the need for a second FQS wait
+ * when all CPUs are idle.
+ */
+ preempt_disable();
+ rcu_qs();
+ rcu_report_qs_rdp(this_cpu_ptr(&rcu_data));
+ preempt_enable();
+
return true;
}
@@ -3769,7 +3781,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
}
rcu_nocb_unlock(rdp);
if (wake_nocb)
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
smp_store_release(&rdp->barrier_seq_snap, gseq);
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..7dfc57e9adb1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -203,7 +203,7 @@ struct rcu_data {
/* during and after the last grace */
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
- int defer_qs_iw_pending; /* Scheduler attention pending? */
+ int defer_qs_pending; /* irqwork or softirq pending? */
struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
@@ -301,7 +301,6 @@ struct rcu_data {
#define RCU_NOCB_WAKE_BYPASS 1
#define RCU_NOCB_WAKE_LAZY 2
#define RCU_NOCB_WAKE 3
-#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -500,7 +499,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
+static bool wake_nocb_gp(struct rcu_data *rdp);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j, bool lazy);
static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 96c49c56fc14..82cada459e5d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -589,7 +589,12 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
- if (ndetected) {
+ if (!ndetected) {
+ // This is invoked from the grace-period worker, so
+ // a new grace period cannot have started. And if this
+ // worker were stalled, we would not get here. ;-)
+ pr_err("INFO: Expedited stall ended before state dump start\n");
+ } else {
pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e6cd56603cad..b3337c7231cc 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -190,9 +190,18 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
+/* Clear any pending deferred wakeup timer (nocb_gp_lock must be held). */
+static void nocb_defer_wakeup_cancel(struct rcu_data *rdp_gp)
+{
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&rdp_gp->nocb_timer);
+ }
+}
+
static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
struct rcu_data *rdp,
- bool force, unsigned long flags)
+ unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
bool needwake = false;
@@ -204,12 +213,9 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
return false;
}
- if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- timer_delete(&rdp_gp->nocb_timer);
- }
+ nocb_defer_wakeup_cancel(rdp_gp);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ if (READ_ONCE(rdp_gp->nocb_gp_sleep)) {
WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
needwake = true;
}
@@ -225,13 +231,13 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
/*
* Kick the GP kthread for this NOCB group.
*/
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+static bool wake_nocb_gp(struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, flags);
}
#ifdef CONFIG_RCU_LAZY
@@ -518,22 +524,17 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
+ * Awaken the no-CBs grace-period kthread if needed due to it legitimately
+ * being asleep.
*/
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
unsigned long flags)
__releases(rdp->nocb_lock)
{
long bypass_len;
- unsigned long cur_gp_seq;
- unsigned long j;
long lazy_len;
long len;
struct task_struct *t;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -549,47 +550,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
lazy_len = READ_ONCE(rdp->lazy_len);
if (was_alldone) {
rdp->qlen_last_fqs_check = len;
+ rcu_nocb_unlock(rdp);
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
} else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock(rdp);
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp_gp->nocb_timer)) {
- rcu_nocb_unlock(rdp);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+
+ return;
}
+
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
@@ -814,10 +794,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (rdp_toggling)
my_rdp->nocb_toggling_rdp = NULL;
- if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- timer_delete(&my_rdp->nocb_timer);
- }
+ nocb_defer_wakeup_cancel(my_rdp);
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
} else {
@@ -966,7 +943,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
- int ndw;
int ret;
if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
@@ -974,8 +950,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
return false;
}
- ndw = rdp_gp->nocb_defer_wakeup;
- ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = __wake_nocb_gp(rdp_gp, rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
return ret;
@@ -991,7 +966,6 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
}
@@ -1272,7 +1246,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
}
rcu_nocb_try_flush_bypass(rdp, jiffies);
rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
sc->nr_to_scan -= _count;
count += _count;
if (sc->nr_to_scan <= 0)
@@ -1657,7 +1631,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+static bool wake_nocb_gp(struct rcu_data *rdp)
{
return false;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
union rcu_special special;
rdp = this_cpu_ptr(&rcu_data);
- if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
- rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+ if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_pending = DEFER_QS_IDLE;
/*
* If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
* 5. Deferred QS reporting does not happen.
*/
if (rcu_preempt_depth() > 0)
- WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+ WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
}
/*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
- raise_softirq_irqoff(RCU_SOFTIRQ);
+ if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+ rdp->defer_qs_pending = DEFER_QS_PENDING;
+ raise_softirq_irqoff(RCU_SOFTIRQ);
+ }
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
// tick enabled.
set_need_resched_current();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+ rdp->defer_qs_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
diff --git a/kernel/resource.c b/kernel/resource.c
index e4e9bac12e6e..31341bdd7707 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -48,6 +48,14 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
+struct resource soft_reserve_resource = {
+ .name = "Soft Reserved",
+ .start = 0,
+ .end = -1,
+ .desc = IORES_DESC_SOFT_RESERVED,
+ .flags = IORESOURCE_MEM,
+};
+
static DEFINE_RWLOCK(resource_lock);
/*
@@ -82,7 +90,7 @@ static struct resource *next_resource(struct resource *p, bool skip_children,
#ifdef CONFIG_PROC_FS
-enum { MAX_IORES_LEVEL = 5 };
+enum { MAX_IORES_LEVEL = 8 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
@@ -321,13 +329,14 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
}
/**
- * find_next_iomem_res - Finds the lowest iomem resource that covers part of
- * [@start..@end].
+ * find_next_res - Finds the lowest resource that covers part of
+ * [@start..@end].
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
* -ENODEV. Returns -EINVAL for invalid parameters.
*
+ * @parent: resource tree root to search
* @start: start address of the resource searched for
* @end: end address of same resource
* @flags: flags which the resource must have
@@ -337,9 +346,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
* The caller must specify @start, @end, @flags, and @desc
* (which may be IORES_DESC_NONE).
*/
-static int find_next_iomem_res(resource_size_t start, resource_size_t end,
- unsigned long flags, unsigned long desc,
- struct resource *res)
+static int find_next_res(struct resource *parent, resource_size_t start,
+ resource_size_t end, unsigned long flags,
+ unsigned long desc, struct resource *res)
{
/* Skip children until we find a top level range that matches */
bool skip_children = true;
@@ -353,7 +362,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, skip_children) {
+ for_each_resource(parent, p, skip_children) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -390,16 +399,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
return p ? 0 : -ENODEV;
}
-static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
- unsigned long flags, unsigned long desc,
- void *arg,
- int (*func)(struct resource *, void *))
+static int find_next_iomem_res(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ struct resource *res)
+{
+ return find_next_res(&iomem_resource, start, end, flags, desc, res);
+}
+
+static int walk_res_desc(struct resource *parent, resource_size_t start,
+ resource_size_t end, unsigned long flags,
+ unsigned long desc, void *arg,
+ int (*func)(struct resource *, void *))
{
struct resource res;
int ret = -EINVAL;
while (start < end &&
- !find_next_iomem_res(start, end, flags, desc, &res)) {
+ !find_next_res(parent, start, end, flags, desc, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
@@ -410,6 +426,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
return ret;
}
+static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func);
+}
+
+
/**
* walk_iomem_res_desc - Walks through iomem resources and calls func()
* with matching resource ranges.
@@ -435,6 +460,18 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
/*
+ * In support of device drivers claiming Soft Reserved resources, walk the Soft
+ * Reserved resource deferral tree.
+ */
+int walk_soft_reserve_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ return walk_res_desc(&soft_reserve_resource, start, end, IORESOURCE_MEM,
+ IORES_DESC_SOFT_RESERVED, arg, func);
+}
+EXPORT_SYMBOL_GPL(walk_soft_reserve_res);
+
+/*
* This function calls the @func callback against all memory ranges of type
 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
* Now, this function is only for System RAM, it deals with full ranges and
@@ -656,6 +693,18 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
}
EXPORT_SYMBOL_GPL(region_intersects);
+/*
+ * Check if the provided range is registered in the Soft Reserved resource
+ * deferral tree for driver consideration.
+ */
+int region_intersects_soft_reserve(resource_size_t start, size_t size)
+{
+ guard(read_lock)(&resource_lock);
+ return __region_intersects(&soft_reserve_resource, start, size,
+ IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED);
+}
+EXPORT_SYMBOL_GPL(region_intersects_soft_reserve);
+
void __weak arch_remove_reservations(struct resource *avail)
{
}
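As a hypothetical illustration of the two new exports, walk_soft_reserve_res() and region_intersects_soft_reserve(), driver code could probe the deferral tree roughly as below; the callback convention is assumed to mirror walk_iomem_res_desc():

/* Hypothetical driver-side sketch, not part of this patch. */
static int soft_reserve_cb(struct resource *res, void *arg)
{
	pr_info("soft-reserved: %pR\n", res);	/* res is clipped to the window */
	return 0;				/* non-zero stops the walk */
}

static void probe_soft_reserved(u64 start, u64 end)
{
	if (region_intersects_soft_reserve(start, end - start + 1) != REGION_DISJOINT)
		walk_soft_reserve_res(start, end, NULL, soft_reserve_cb);
}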
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 395d8b002350..b0973d19f366 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,9 @@
#define RSEQ_BUILD_SLOW_PATH
#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
@@ -120,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#endif /* CONFIG_TRACEPOINTS */
-#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -138,6 +140,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
+ stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
+ stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
+ stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
+ stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
+ }
}
seq_printf(m, "exit: %16lu\n", stats.exit);
@@ -148,6 +157,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
+ seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
+ seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
+ seq_printf(m, "syield: %16lu\n", stats.s_yielded);
+ seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
+ }
return 0;
}
@@ -205,16 +221,19 @@ static const struct file_operations debug_ops = {
.release = single_release,
};
+static void rseq_slice_ext_init(struct dentry *root_dir);
+
static int __init rseq_debugfs_init(void)
{
struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
rseq_stats_init(root_dir);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+ rseq_slice_ext_init(root_dir);
return 0;
}
__initcall(rseq_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
@@ -389,6 +408,8 @@ static bool rseq_reset_ids(void)
*/
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
+ u32 rseqfl = 0;
+
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
@@ -405,7 +426,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
return 0;
}
- if (unlikely(flags))
+ if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
return -EINVAL;
if (current->rseq.usrptr) {
@@ -440,6 +461,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (rseq_slice_extension_enabled() &&
+ (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ }
+
scoped_user_write_access(rseq, efault) {
/*
* If the rseq_cs pointer is non-NULL on registration, clear it to
@@ -449,11 +477,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* clearing the fields. Don't bother reading it, just reset it.
*/
unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ unsafe_put_user(rseqfl, &rseq->flags, efault);
/* Initialize IDs in user space */
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
unsafe_put_user(0U, &rseq->node_id, efault);
unsafe_put_user(0U, &rseq->mm_cid, efault);
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
}
/*
@@ -464,6 +494,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
current->rseq.len = rseq_len;
current->rseq.sig = sig;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+ current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
+#endif
+
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
@@ -476,3 +510,328 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
efault:
return -EFAULT;
}
+
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+ struct hrtimer timer;
+ void *cookie;
+};
+
+static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
+DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+/*
+ * When the timer expires and the task is still in user space, the return
+ * from interrupt will revoke the grant and schedule. If the task already
+ * entered the kernel via a syscall and the timer fires before the syscall
+ * work was able to cancel it, then depending on the preemption model this
+ * will either reschedule on return from interrupt or in the syscall work
+ * below.
+ */
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+ struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+ /*
+ * Validate that the task which armed the timer is still on the
+ * CPU. It could have been scheduled out without canceling the
+ * timer.
+ */
+ if (st->cookie == current && current->rseq.slice.state.granted) {
+ rseq_stat_inc(rseq_stats.s_expired);
+ set_need_resched_current();
+ }
+ return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+ struct task_struct *curr = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * This check prevents a task that was granted a time slice extension
+ * from exceeding the maximum scheduling latency when the grant expired
+ * before going out to user space. Don't bother to clear the grant
+ * here; it will be cleaned up automatically before going out to user
+ * space after being scheduled back in.
+ */
+ if (unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns())) {
+ set_need_resched_current();
+ return true;
+ }
+
+ /*
+ * Store the task pointer as a cookie for comparison in the timer
+ * function. This is safe as the timer is CPU local and cannot be
+ * in the expiry function at this point.
+ */
+ st->cookie = curr;
+ hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /* Arm the syscall entry work */
+ set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+ return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+ /*
+ * st->cookie can be safely read as preemption is disabled and the
+ * timer is CPU local.
+ *
+ * As this is most probably the first expiring timer, the cancel is
+ * expensive as it has to reprogram the hardware, but that's less
+ * expensive than going through a full hrtimer_interrupt() cycle
+ * for nothing.
+ *
+ * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+ * local and once the hrtimer code disabled interrupts the timer
+ * callback cannot be running.
+ */
+ if (st->cookie == current)
+ hrtimer_try_to_cancel(&st->timer);
+}
+
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+ /*
+ * The interrupt guard is required to prevent inconsistent state in
+ * this case:
+ *
+ * set_tsk_need_resched()
+ * --> Interrupt
+ * wakeup()
+ * set_tsk_need_resched()
+ * set_preempt_need_resched()
+ * schedule_on_return()
+ * clear_tsk_need_resched()
+ * clear_preempt_need_resched()
+ * set_preempt_need_resched() <- Inconsistent state
+ *
+ * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+ * only sets the already set bit and does not create inconsistent
+ * state.
+ */
+ scoped_guard(irq)
+ set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+ u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
+ u32 uval;
+
+ if (get_user(uval, sctrl) || uval != expected)
+ force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ *
+ * While the recommended way to relinquish the CPU free of side effects is
+ * rseq_slice_yield(2), any syscall within a granted slice terminates the
+ * grant and immediately reschedules if required. This supports onion layer
+ * applications, where the code requesting the grant cannot control the
+ * code within the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+ clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ rseq_slice_validate_ctrl(ctrl.all);
+
+ /*
+ * The kernel might have raced, revoked the grant and updated
+ * userspace, but kept the SLICE work set.
+ */
+ if (!ctrl.granted)
+ return;
+
+ /*
+ * Required to stabilize the per CPU timer pointer and to make
+ * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
+ *
+ * Leaving the scope will reschedule on preemption models FULL,
+ * LAZY and RT if necessary.
+ */
+ scoped_guard(preempt) {
+ rseq_cancel_slice_extension_timer();
+ /*
+ * Now that preemption is disabled, quickly check whether
+ * the task was already rescheduled before arriving here.
+ */
+ if (!curr->rseq.event.sched_switch) {
+ rseq_slice_set_need_resched(curr);
+
+ if (syscall == __NR_rseq_slice_yield) {
+ rseq_stat_inc(rseq_stats.s_yielded);
+ /* Update the yielded state for syscall return */
+ curr->rseq.slice.yielded = 1;
+ } else {
+ rseq_stat_inc(rseq_stats.s_aborted);
+ }
+ }
+ }
+ /* Reschedule on NONE/VOLUNTARY preemption models */
+ cond_resched();
+
+ /* Clear the grant in kernel state and user space */
+ curr->rseq.slice.state.granted = false;
+ if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+ force_sig(SIGSEGV);
+}
+
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+ switch (arg2) {
+ case PR_RSEQ_SLICE_EXTENSION_GET:
+ if (arg3)
+ return -EINVAL;
+ return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;
+
+ case PR_RSEQ_SLICE_EXTENSION_SET: {
+ u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);
+
+ if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
+ return -EINVAL;
+ if (!rseq_slice_extension_enabled())
+ return -ENOTSUPP;
+ if (!current->rseq.usrptr)
+ return -ENXIO;
+
+ /* No change? */
+ if (enable == !!current->rseq.slice.state.enabled)
+ return 0;
+
+ if (get_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ if (current->rseq.slice.state.enabled)
+ valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if ((rflags & valid) != valid)
+ goto die;
+
+ rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (enable)
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if (put_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ current->rseq.slice.state.enabled = enable;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+die:
+ force_sig(SIGSEGV);
+ return -EFAULT;
+}
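A minimal user-space sketch of the opt-in path, assuming the top-level prctl() option is exposed as PR_RSEQ_SLICE_EXTENSION (the option number itself is not visible in this hunk; the GET/SET sub-commands and the enable flag are taken from the code above):

/* Hypothetical user-space snippet, not part of this patch. */
#include <sys/prctl.h>

static int rseq_slice_ext_opt_in(void)
{
	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0))
		return -1;	/* ENOTSUPP, ENXIO or EINVAL per the code above */

	/* Reports PR_RSEQ_SLICE_EXT_ENABLE (non-zero) when enabled. */
	return prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
}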
+
+/**
+ * sys_rseq_slice_yield - yield the current CPU free of side effects when a
+ *                        task that was granted a time slice extension is done
+ *                        with the critical work before being forced out.
+ *
+ * Return: 1 if the task successfully yielded the CPU within the granted slice.
+ *         0 if the slice extension was either never granted or was revoked by
+ *         overrunning the granted extension, issuing a syscall other than this
+ *         one, or being scheduled out earlier due to a subsequent interrupt.
+ *
+ * The syscall itself does not schedule because the syscall entry work
+ * already relinquishes the CPU and schedules if required.
+ */
+SYSCALL_DEFINE0(rseq_slice_yield)
+{
+ int yielded = !!current->rseq.slice.yielded;
+
+ current->rseq.slice.yielded = 0;
+ return yielded;
+}
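For illustration, the return-value convention documented above could be consumed from user space roughly as below; the request side of the ABI is not shown in this hunk and is therefore omitted, and __NR_rseq_slice_yield is assumed to come from the exported syscall headers:

/* Hypothetical user-space snippet, not part of this patch. */
#include <unistd.h>
#include <sys/syscall.h>

static long leave_critical_section(void)
{
	/*
	 * 1: the CPU was yielded within the granted slice,
	 * 0: no grant existed or it was already revoked.
	 */
	return syscall(__NR_rseq_slice_yield);
}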
+
+static int rseq_slice_ext_show(struct seq_file *m, void *p)
+{
+ seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
+ return 0;
+}
+
+static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int nsecs;
+
+ if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
+ return -EINVAL;
+
+ if (nsecs < rseq_slice_ext_nsecs_min)
+ return -ERANGE;
+
+ if (nsecs > rseq_slice_ext_nsecs_max)
+ return -ERANGE;
+
+ rseq_slice_ext_nsecs = nsecs;
+
+ return count;
+}
+
+static int rseq_slice_ext_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_slice_ext_show, inode->i_private);
+}
+
+static const struct file_operations slice_ext_ops = {
+ .open = rseq_slice_ext_open,
+ .read = seq_read,
+ .write = rseq_slice_ext_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void rseq_slice_ext_init(struct dentry *root_dir)
+{
+ debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
+}
+
+static int __init rseq_slice_cmdline(char *str)
+{
+ bool on;
+
+ if (kstrtobool(str, &on))
+ return 0;
+
+ if (!on)
+ static_branch_disable(&rseq_slice_extension_key);
+ return 1;
+}
+__setup("rseq_slice_ext=", rseq_slice_cmdline);
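/*
 * Usage note (inferred from kstrtobool() semantics, not stated in the patch):
 * booting with e.g. rseq_slice_ext=0, =n or =off disables the static key and
 * thereby the whole feature; values kstrtobool() cannot parse leave it enabled.
 */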
+
+static int __init rseq_slice_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+ }
+ return 0;
+}
+device_initcall(rseq_slice_init);
+#else
+static void rseq_slice_ext_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_SLICE_EXTENSION */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 8ae86371ddcd..b1f1a367034f 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
+CONTEXT_ANALYSIS_core.o := y
+CONTEXT_ANALYSIS_fair.o := y
+
# The compilers are complaining about unused variables inside an if(0) scope
# block. This is daft, shut them up.
ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..2ae4fbf13431 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
@@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+ else
+ disable_sched_clock_irqtime(); /* disable if clock unstable. */
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045f83ad261e..759777694c78 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
@@ -396,6 +399,8 @@ static atomic_t sched_core_count;
static struct cpumask sched_core_mask;
static void sched_core_lock(int cpu, unsigned long *flags)
+ __context_unsafe(/* acquires multiple */)
+ __acquires(&runqueues.__lock) /* overapproximation */
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t, i = 0;
@@ -406,6 +411,8 @@ static void sched_core_lock(int cpu, unsigned long *flags)
}
static void sched_core_unlock(int cpu, unsigned long *flags)
+ __context_unsafe(/* releases multiple */)
+ __releases(&runqueues.__lock) /* overapproximation */
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t;
@@ -630,6 +637,7 @@ EXPORT_SYMBOL(__trace_set_current_state);
*/
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+ __context_unsafe()
{
raw_spinlock_t *lock;
@@ -655,6 +663,7 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
}
bool raw_spin_rq_trylock(struct rq *rq)
+ __context_unsafe()
{
raw_spinlock_t *lock;
bool ret;
@@ -696,15 +705,16 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
raw_spin_rq_lock(rq1);
if (__rq_lockp(rq1) != __rq_lockp(rq2))
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ else
+ __acquire_ctx_lock(__rq_lockp(rq2)); /* fake acquire */
double_rq_clock_clear_update(rq1, rq2);
}
/*
- * __task_rq_lock - lock the rq @p resides on.
+ * ___task_rq_lock - lock the rq @p resides on.
*/
-struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock)
+struct rq *___task_rq_lock(struct task_struct *p, struct rq_flags *rf)
{
struct rq *rq;
@@ -727,9 +737,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
/*
* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
-struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
+struct rq *_task_rq_lock(struct task_struct *p, struct rq_flags *rf)
{
struct rq *rq;
@@ -770,6 +778,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* RQ-clock updating methods:
*/
+/* Use CONFIG_PARAVIRT as this will avoid more #ifdef in arch code. */
+#ifdef CONFIG_PARAVIRT
+struct static_key paravirt_steal_rq_enabled;
+#endif
+
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
@@ -1136,6 +1149,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif)
{
trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
}
+EXPORT_SYMBOL_GPL(__trace_set_need_resched);
void resched_curr(struct rq *rq)
{
@@ -2090,7 +2104,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
- rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2123,7 +2136,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
- rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2174,10 +2186,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
- if (p->sched_class == donor->sched_class)
- donor->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, donor->sched_class))
+ if (p->sched_class == rq->next_class) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
+
+ } else if (sched_class_above(p->sched_class, rq->next_class)) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
resched_curr(rq);
+ rq->next_class = p->sched_class;
+ }
/*
* A queue event has occurred, and we're going to schedule. In
@@ -2431,6 +2447,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
*/
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
+ __must_hold(__rq_lockp(rq))
{
lockdep_assert_rq_held(rq);
@@ -2477,6 +2494,7 @@ struct set_affinity_pending {
*/
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
+ __must_hold(__rq_lockp(rq))
{
/* Affinity changed (again). */
if (!is_cpu_allowed(p, dest_cpu))
@@ -2513,6 +2531,12 @@ static int migration_cpu_stop(void *data)
*/
flush_smp_call_function_queue();
+ /*
+ * We may change the underlying rq, but the locks held will
+ * appropriately be "transferred" when switching.
+ */
+ context_unsafe_alias(rq);
+
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2624,6 +2648,8 @@ int push_cpu_stop(void *arg)
if (!lowest_rq)
goto out_unlock;
+ lockdep_assert_rq_held(lowest_rq);
+
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
move_queued_task_locked(rq, lowest_rq, p);
@@ -2834,8 +2860,7 @@ void release_user_cpus_ptr(struct task_struct *p)
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
+ __releases(__rq_lockp(rq), &p->pi_lock)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
@@ -2990,8 +3015,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
struct affinity_context *ctx,
struct rq *rq,
struct rq_flags *rf)
- __releases(rq->lock)
- __releases(p->pi_lock)
+ __releases(__rq_lockp(rq), &p->pi_lock)
{
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
@@ -3607,6 +3631,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p)
trace_sched_wakeup(p);
}
+void update_rq_avg_idle(struct rq *rq)
+{
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
+ rq->avg_idle = max;
+ rq->idle_stamp = 0;
+}
+
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
@@ -3642,18 +3678,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
-
- if (rq->idle_stamp) {
- u64 delta = rq_clock(rq) - rq->idle_stamp;
- u64 max = 2*rq->max_idle_balance_cost;
-
- update_avg(&rq->avg_idle, delta);
-
- if (rq->avg_idle > max)
- rq->avg_idle = max;
-
- rq->idle_stamp = 0;
- }
}
/*
@@ -4273,29 +4297,30 @@ static bool __task_needs_rq_lock(struct task_struct *p)
*/
int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
- struct rq *rq = NULL;
struct rq_flags rf;
int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (__task_needs_rq_lock(p))
- rq = __task_rq_lock(p, &rf);
+ if (__task_needs_rq_lock(p)) {
+ struct rq *rq = __task_rq_lock(p, &rf);
- /*
- * At this point the task is pinned; either:
- * - blocked and we're holding off wakeups (pi->lock)
- * - woken, and we're holding off enqueue (rq->lock)
- * - queued, and we're holding off schedule (rq->lock)
- * - running, and we're holding off de-schedule (rq->lock)
- *
- * The called function (@func) can use: task_curr(), p->on_rq and
- * p->__state to differentiate between these states.
- */
- ret = func(p, arg);
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
- if (rq)
__task_rq_unlock(rq, p, &rf);
+ } else {
+ ret = func(p, arg);
+ }
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -4972,6 +4997,8 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
+ __releases(__rq_lockp(rq))
+ __acquires(__rq_lockp(this_rq()))
{
/*
* Since the runqueue lock will be released by the next
@@ -4985,9 +5012,15 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
/* this is a valid case when another task releases the spinlock */
rq_lockp(rq)->owner = next;
#endif
+ /*
+ * Model the rq reference switcheroo.
+ */
+ __release(__rq_lockp(rq));
+ __acquire(__rq_lockp(this_rq()));
}
static inline void finish_lock_switch(struct rq *rq)
+ __releases(__rq_lockp(rq))
{
/*
* If we are tracking spinlock dependencies then we have to
@@ -5043,6 +5076,7 @@ static inline void kmap_local_sched_in(void)
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
+ __must_hold(__rq_lockp(rq))
{
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
@@ -5073,7 +5107,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
- __releases(rq->lock)
+ __releases(__rq_lockp(this_rq()))
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
@@ -5169,7 +5203,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* @prev: the thread we just switched away from.
*/
asmlinkage __visible void schedule_tail(struct task_struct *prev)
- __releases(rq->lock)
+ __releases(__rq_lockp(this_rq()))
{
/*
* New tasks start with FORK_PREEMPT_COUNT, see there and
@@ -5201,6 +5235,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
+ __releases(__rq_lockp(rq))
{
prepare_task_switch(rq, prev, next);
@@ -5869,6 +5904,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
*/
static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
const struct sched_class *class;
struct task_struct *p;
@@ -5969,6 +6005,7 @@ static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
struct task_struct *next, *p, *max;
const struct cpumask *smt_mask;
@@ -6277,6 +6314,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
}
static void sched_core_balance(struct rq *rq)
+ __must_hold(__rq_lockp(rq))
{
struct sched_domain *sd;
int cpu = cpu_of(rq);
@@ -6422,6 +6460,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
return __pick_next_task(rq, prev, rf);
}
@@ -6808,6 +6847,7 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ rq->next_class = next->sched_class;
if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next)
@@ -7552,7 +7592,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-# ifndef CONFIG_PREEMPT_RT
+# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY))
if (!strcmp(str, "none"))
return preempt_dynamic_none;
@@ -8045,6 +8085,12 @@ static int __balance_push_cpu_stop(void *arg)
int cpu;
scoped_guard (raw_spinlock_irq, &p->pi_lock) {
+ /*
+ * We may change the underlying rq, but the locks held will
+ * appropriately be "transferred" when switching.
+ */
+ context_unsafe_alias(rq);
+
cpu = select_fallback_rq(rq->cpu, p);
rq_lock(rq, &rf);
@@ -8068,6 +8114,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
* effective when the hotplug motion is down.
*/
static void balance_push(struct rq *rq)
+ __must_hold(__rq_lockp(rq))
{
struct task_struct *push_task = rq->curr;
@@ -8477,6 +8524,9 @@ int sched_cpu_dying(unsigned int cpu)
dump_rq_tasks(rq, KERN_WARNING);
}
dl_server_stop(&rq->fair_server);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_stop(&rq->ext_server);
+#endif
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -8652,6 +8702,8 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
+ rq->next_class = &idle_sched_class;
+
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8680,6 +8732,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
@@ -9111,6 +9166,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
bool resched = false;
+ bool queued = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9122,10 +9178,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
scx_cgroup_move_task(tsk);
if (scope->running)
resched = true;
+ queued = scope->queued;
}
if (resched)
resched_curr(rq);
+ else if (queued)
+ wakeup_preempt(rq, tsk, 0);
__balance_callbacks(rq, &rq_guard.rf);
}
@@ -10269,7 +10328,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* Serialization rules:
*
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
- * protects mm::mm_cid::users.
+ * protects mm::mm_cid::users and mode switch
+ * transitions
*
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10345,70 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
+ * on the CIDs. When this bit is set the tasks drop the CID back into the
+ * pool when scheduling out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
*
* Mode switching:
*
+ * The ownership mode is per process and stored in mm::mm_cid::mode with the
+ * following possible states:
+ *
+ * 0: Per task ownership
+ * 0 | MM_CID_TRANSIT: Transition from per CPU to per task
+ * MM_CID_ONCPU: Per CPU ownership
+ * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
+ *
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm::mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
+ * CIDs and denotes that the CID is only temporarily owned by a
+ * task. When the task schedules out it drops the CID back into the
+ * pool if this bit is set.
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fixup
+ * or drop the CIDs and after completion it clears MM_CID_TRANSIT in
+ *    mm::mm_cid::mode. After that point the CIDs are strictly task or CPU
+ * owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ * then migrated to another CPU before the fixup freed enough per task
+ * CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ * CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock because sched_in() is invoked
+ * with runqueue lock held and loops in search of a CID and the fixup
+ * thread can't make progress freeing them up because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that's a temporary contention bound to the
+ * transition period. After that everything goes back into steady state and
+ * nothing except fork() and exit() will touch the bitmap. This is an
+ * acceptable tradeoff as it completely avoids complex serialization,
+ * memory barriers and atomic operations for the common case.
+ *
+ * Aside from that, this mechanism also ensures RT compatibility:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ * short runqueue lock held sections.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ * when there is a thundering herd scenario of tasks scheduling in and
+ *   out concurrently. There is not much that can be done about that
+ *   except avoiding mode switching through a proper overall system
+ *   configuration.
+ *
* Switching to per CPU mode happens when the user count becomes greater
* than the maximum number of CIDs, which is calculated by:
*
@@ -10306,12 +10422,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*
* At the point of switching to per CPU mode the new user is not yet
* visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if a task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
*
* Switching back to task mode happens when the user count goes below the
* threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10444,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* run either in the deferred update function in context of a workqueue or
* by a task which forks a new one or by a task which exits. Whatever
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- * CID and denotes that the CID is only temporarily owned by the
- * task. When it schedules out the task drops the CID back into the
- * pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
- * task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup has covered them do the handover
+ * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
+ * that no CID related to that MM is owned by a CPU anymore.
*/
/*
@@ -10379,6 +10479,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
static bool mm_update_max_cids(struct mm_struct *mm)
{
struct mm_mm_cid *mc = &mm->mm_cid;
+ bool percpu = cid_on_cpu(mc->mode);
lockdep_assert_held(&mm->mm_cid.lock);
@@ -10387,7 +10488,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
__mm_update_max_cids(mc);
/* Check whether owner mode must be changed */
- if (!mc->percpu) {
+ if (!percpu) {
/* Enable per CPU mode when the number of users is above max_cids */
if (mc->users > mc->max_cids)
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10398,12 +10499,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
}
/* Mode change required? */
- if (!!mc->percpu == !!mc->pcpu_thrs)
+ if (percpu == !!mc->pcpu_thrs)
return false;
- /* When switching back to per TASK mode, set the transition flag */
- if (!mc->pcpu_thrs)
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
- WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+
+ /* Flip the mode and set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+ /*
+ * Order the store against the subsequent fixups so that
+ * acquire(rq::lock) cannot be reordered by the CPU before the
+ * store.
+ */
+ smp_mb();
return true;
}
@@ -10428,7 +10534,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
WRITE_ONCE(mc->nr_cpus_allowed, weight);
__mm_update_max_cids(mc);
- if (!mc->percpu)
+ if (!cid_on_cpu(mc->mode))
return;
/* Adjust the threshold to the wider set */
@@ -10446,6 +10552,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
irq_work_queue(&mc->irq_work);
}
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+ /*
+ * Ensure that the store removing the TRANSIT bit cannot be
+ * reordered by the CPU before the fixups have been completed.
+ */
+ smp_mb();
+ WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10489,14 +10605,13 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
}
}
}
- /* Clear the transition bit */
- WRITE_ONCE(mm->mm_cid.transit, 0);
+ mm_cid_complete_transit(mm, 0);
}
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_task(t->mm_cid.cid)) {
- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
}
}
@@ -10509,18 +10624,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
if (!t->mm_cid.active)
return false;
if (cid_on_task(t->mm_cid.cid)) {
- /* If running on the CPU, transfer the CID, otherwise drop it */
+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
else
mm_unset_cid_on_task(t);
}
return true;
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
unsigned int users;
@@ -10558,6 +10672,14 @@ static void mm_cid_fixup_tasks_to_cpus(void)
}
}
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ mm_cid_do_fixup_tasks_to_cpus(mm);
+ mm_cid_complete_transit(mm, MM_CID_ONCPU);
+}
+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
t->mm_cid.active = 1;
@@ -10586,17 +10708,17 @@ void sched_mm_cid_fork(struct task_struct *t)
}
if (!sched_mm_cid_add_user(t, mm)) {
- if (!mm->mm_cid.percpu)
+ if (!cid_on_cpu(mm->mm_cid.mode))
t->mm_cid.cid = mm_get_cid(mm);
return;
}
/* Handle the mode change and transfer current's CID */
- percpu = !!mm->mm_cid.percpu;
+ percpu = cid_on_cpu(mm->mm_cid.mode);
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
- mm_cid_transfer_to_cpu(current, pcp);
+ mm_cid_transit_to_cpu(current, pcp);
}
if (percpu) {
@@ -10631,7 +10753,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
* affinity change increased the number of allowed CPUs and the
* deferred fixup did not run yet.
*/
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return false;
/*
* A failed fork(2) cleanup never gets here, so @current must have
@@ -10664,8 +10786,13 @@ void sched_mm_cid_exit(struct task_struct *t)
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
if (!__sched_mm_cid_exit(t))
return;
- /* Mode change required. Transfer currents CID */
- mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ /*
+ * Mode change. The task has already unset its
+ * CID and dealt with a possibly set TRANSIT
+ * bit. If the CID is owned by the CPU then
+ * drop it.
+ */
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
}
mm_cid_fixup_cpus_to_tasks(mm);
return;
@@ -10722,7 +10849,7 @@ static void mm_cid_work_fn(struct work_struct *work)
if (!mm_update_max_cids(mm))
return;
/* Affinity changes can only switch back to task mode */
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return;
}
mm_cid_fixup_cpus_to_tasks(mm);
@@ -10743,8 +10870,7 @@ static void mm_cid_irq_work(struct irq_work *work)
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
mm->mm_cid.max_cids = 0;
- mm->mm_cid.percpu = 0;
- mm->mm_cid.transit = 0;
+ mm->mm_cid.mode = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
mm->mm_cid.pcpu_thrs = 0;
@@ -10780,13 +10906,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
flags |= DEQUEUE_NOCLOCK;
}
- if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
- p->sched_class->switching_from(rq, p);
- }
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
*ctx = (struct sched_change_ctx){
.p = p,
+ .class = p->sched_class,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
@@ -10817,6 +10942,11 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ /*
+ * Changing class without *QUEUE_CLASS is bad.
+ */
+ WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS));
+
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
@@ -10828,6 +10958,25 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
+
+ if (ctx->running) {
+ /*
+ * If this was a class promotion, let the old class
+ * know it got preempted. Note that none of the
+ * switch*_from() methods know the new class and none
+ * of the switch*_to() methods know the old class.
+ */
+ if (sched_class_above(p->sched_class, ctx->class)) {
+ rq->next_class->wakeup_preempt(rq, p, 0);
+ rq->next_class = p->sched_class;
+ }
+ /*
+ * If this was a degradation in class, make sure to
+ * reschedule.
+ */
+ if (sched_class_above(ctx->class, p->sched_class))
+ resched_curr(rq);
+ }
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0ab5f9d4bc59..cfc40181f66e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
"sugov:%d",
cpumask_first(policy->related_cpus));
if (IS_ERR(thread)) {
- pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ pr_err("failed to create sugov thread: %pe\n", thread);
return PTR_ERR(thread);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 4f97896887ec..fbf31db0d2f3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -12,6 +12,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -25,16 +27,15 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ if (irqtime_enabled())
+ static_branch_disable(&sched_clock_irqtime);
}
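
For context, the read side of such a static key avoids any conditional test in the hot path because the branch is patched at runtime. A hedged, kernel-style sketch of a consumer; it assumes irqtime_enabled() is a thin wrapper around static_branch_likely() as elsewhere in this series, and is not a standalone program:

/* Sketch only; assumes irqtime_enabled() wraps the static key roughly
 * as kernel/sched/sched.h does for this series.
 */
static inline bool irqtime_enabled(void)
{
	return static_branch_likely(&sched_clock_irqtime);
}

static void account_irq_delta(u64 delta)
{
	if (!irqtime_enabled())	/* jump label: no runtime test when disabled */
		return;
	/* ... charge 'delta' nanoseconds as IRQ time ... */
}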
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -251,6 +252,19 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
* ticks are not redelivered later. Due to that, this function may on
* occasion account more time than the calling functions think elapsed.
*/
+#ifdef CONFIG_PARAVIRT
+struct static_key paravirt_steal_enabled;
+
+#ifdef CONFIG_HAVE_PV_STEAL_CLOCK_GEN
+static u64 native_steal_clock(int cpu)
+{
+ return 0;
+}
+
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+#endif
+#endif
+
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c509f2e7d69d..d08b00429323 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1034,6 +1034,12 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
return;
}
+ /*
+ * When [4] D->A is followed by [1] A->B, dl_defer_running
+ * needs to be cleared, otherwise it will fail to properly
+ * start the zero-laxity timer.
+ */
+ dl_se->dl_defer_running = 0;
replenish_dl_new_period(dl_se, rq);
} else if (dl_server(dl_se) && dl_se->dl_defer) {
/*
@@ -1443,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
dl_se->dl_defer_idle = 0;
/*
- * The fair server can consume its runtime while throttled (not queued/
- * running as regular CFS).
+ * The DL server can consume its runtime while throttled (not
+ * queued / running as regular CFS).
*
* If the server consumes its entire runtime in this state. The server
* is not required for the current period. Thus, reset the server by
@@ -1529,10 +1535,10 @@ throttle:
}
/*
- * The fair server (sole dl_server) does not account for real-time
- * workload because it is running fair work.
+ * The dl_server does not account for real-time workload because it
+ * is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se->dl_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1567,9 +1573,9 @@ throttle:
* In the non-defer mode, the idle time is not accounted, as the
* server provides a guarantee.
*
- * If the dl_server is in defer mode, the idle time is also considered
- * as time available for the fair server, avoiding a penalty for the
- * rt scheduler that did not consumed that time.
+ * If the dl_server is in defer mode, the idle time is also considered as
+ * time available for the dl_server, avoiding a penalty for the rt
+ * scheduler that did not consume that time.
*/
void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
@@ -1655,6 +1661,12 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* dl_server_active = 1;
* enqueue_dl_entity()
* update_dl_entity(WAKEUP)
+ * if (dl_time_before() || dl_entity_overflow())
+ * dl_defer_running = 0;
+ * replenish_dl_new_period();
+ * // fwd period
+ * dl_throttled = 1;
+ * dl_defer_armed = 1;
* if (!dl_defer_running)
* dl_defer_armed = 1;
* dl_throttled = 1;
@@ -1787,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
return;
/*
@@ -1848,6 +1860,18 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &rq->ext_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+#endif
}
}
@@ -1874,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
int cpu = cpu_of(rq);
struct dl_bw *dl_b;
unsigned long cap;
- int retval = 0;
int cpus;
dl_b = dl_bw_of(cpu);
@@ -1906,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
- return retval;
+ return 0;
}
/*
@@ -2503,9 +2526,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
- int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * Can only get preempted by stop-class, and those should be
+ * few and short lived; it doesn't really make sense to push
+ * anything away for that.
+ */
+ if (p->sched_class != &dl_sched_class)
+ return;
+
if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
@@ -3179,6 +3209,36 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
+static void dl_server_add_bw(struct root_domain *rd, int cpu)
+{
+ struct sched_dl_entity *dl_se;
+
+ dl_se = &cpu_rq(cpu)->fair_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &cpu_rq(cpu)->ext_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+#endif
+}
+
+static u64 dl_server_read_bw(int cpu)
+{
+ u64 dl_bw = 0;
+
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (cpu_rq(cpu)->ext_server.dl_server)
+ dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
+#endif
+
+ return dl_bw;
+}
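
The dl_bw values summed here are fixed-point utilizations that dl_server_apply_params() derives via to_ratio() from runtime and period. A simplified standalone sketch of that computation; BW_SHIFT = 20 is an assumption mirroring the scheduler's usual fixed point, and the real helper also special-cases RUNTIME_INF:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20	/* assumed fixed-point shift */

/* Simplified stand-in for the scheduler's to_ratio() helper. */
static uint64_t toy_to_ratio(uint64_t period, uint64_t runtime)
{
	if (!period)
		return 0;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* 50 ms runtime every 100 ms period -> half of one CPU */
	uint64_t bw = toy_to_ratio(100 * 1000000ULL, 50 * 1000000ULL);

	printf("dl_bw = %llu out of %llu (%.0f%%)\n",
	       (unsigned long long)bw, 1ULL << BW_SHIFT,
	       100.0 * bw / (double)(1ULL << BW_SHIFT));
	return 0;
}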
+
void dl_clear_root_domain(struct root_domain *rd)
{
int i;
@@ -3197,12 +3257,8 @@ void dl_clear_root_domain(struct root_domain *rd)
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
* them, we need to account for them here explicitly.
*/
- for_each_cpu(i, rd->span) {
- struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
-
- if (dl_server(dl_se) && cpu_active(i))
- __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
- }
+ for_each_cpu(i, rd->span)
+ dl_server_add_bw(rd, i);
}
void dl_clear_root_domain_cpu(int cpu)
@@ -3348,9 +3404,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
#endif
DEFINE_SCHED_CLASS(dl) = {
-
- .queue_mask = 8,
-
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -3644,6 +3697,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
dl_se->dl_server = 0;
+ dl_se->dl_defer = 0;
+ dl_se->dl_defer_running = 0;
+ dl_se->dl_defer_armed = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
@@ -3701,7 +3757,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
- u64 fair_server_bw = 0;
+ u64 dl_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -3734,27 +3790,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
cap -= arch_scale_cpu_capacity(cpu);
/*
- * cpu is going offline and NORMAL tasks will be moved away
- * from it. We can thus discount dl_server bandwidth
- * contribution as it won't need to be servicing tasks after
- * the cpu is off.
+ * cpu is going offline and NORMAL and EXT tasks will be
+ * moved away from it. We can thus discount dl_server
+ * bandwidth contribution as it won't need to be servicing
+ * tasks after the cpu is off.
*/
- if (cpu_rq(cpu)->fair_server.dl_server)
- fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+ dl_server_bw = dl_server_read_bw(cpu);
/*
* Not much to check if no DEADLINE bandwidth is present.
* dl_servers we can discount, as tasks will be moved out the
* offlined CPUs anyway.
*/
- if (dl_b->total_bw - fair_server_bw > 0) {
+ if (dl_b->total_bw - dl_server_bw > 0) {
/*
* Leaving at least one CPU for DEADLINE tasks seems a
* wise thing to do. As said above, cpu is not offline
* yet, so account for that.
*/
if (dl_bw_cpus(cpu) - 1)
- overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0);
else
overflow = 1;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..b24f40f05019 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = {
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[16];
unsigned int scaling;
+ int ret;
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
- buf[cnt] = '\0';
-
- if (kstrtouint(buf, 10, &scaling))
- return -EINVAL;
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling);
+ if (ret)
+ return ret;
if (scaling >= SCHED_TUNABLESCALING_END)
return -EINVAL;
@@ -243,7 +237,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2;
int j;
/* Count entries in NULL terminated preempt_modes */
@@ -336,17 +330,19 @@ enum dl_param {
DL_PERIOD,
};
-static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
-static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
-static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos, enum dl_param param)
+static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param,
+ void *server)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
+ u64 old_runtime, runtime, period;
struct rq *rq = cpu_rq(cpu);
- u64 runtime, period;
+ int retval = 0;
size_t err;
- int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
@@ -354,8 +350,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return err;
scoped_guard (rq_lock_irqsave, rq) {
- runtime = rq->fair_server.dl_runtime;
- period = rq->fair_server.dl_period;
+ old_runtime = runtime = dl_se->dl_runtime;
+ period = dl_se->dl_period;
switch (param) {
case DL_RUNTIME:
@@ -371,60 +367,68 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
}
if (runtime > period ||
- period > fair_server_period_max ||
- period < fair_server_period_min) {
+ period > dl_server_period_max ||
+ period < dl_server_period_min) {
return -EINVAL;
}
update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
-
- retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
- if (retval)
- cnt = retval;
+ dl_server_stop(dl_se);
+ retval = dl_server_apply_params(dl_se, runtime, period, 0);
+ dl_server_start(dl_se);
- if (!runtime)
- printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
- cpu_of(rq));
+ if (retval < 0)
+ return retval;
+ }
- if (rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
+ if (!!old_runtime ^ !!runtime) {
+ pr_info("%s server %sabled on CPU %d%s.\n",
+ server == &rq->fair_server ? "Fair" : "Ext",
+ runtime ? "en" : "dis",
+ cpu_of(rq),
+ runtime ? "" : ", system may malfunction due to starvation");
}
*ppos += cnt;
return cnt;
}
-static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param,
+ void *server)
{
- unsigned long cpu = (unsigned long) m->private;
- struct rq *rq = cpu_rq(cpu);
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
u64 value;
switch (param) {
case DL_RUNTIME:
- value = rq->fair_server.dl_runtime;
+ value = dl_se->dl_runtime;
break;
case DL_PERIOD:
- value = rq->fair_server.dl_period;
+ value = dl_se->dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
-
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->fair_server);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_RUNTIME);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
@@ -440,16 +444,57 @@ static const struct file_operations fair_server_runtime_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_runtime_fops = {
+ .open = sched_ext_server_runtime_open,
+ .write = sched_ext_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->fair_server);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_PERIOD);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
@@ -465,6 +510,40 @@ static const struct file_operations fair_server_period_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_period_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
+}
+
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_period_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_period_fops = {
+ .open = sched_ext_server_period_open,
+ .write = sched_ext_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
@@ -488,6 +567,29 @@ static void debugfs_fair_server_init(void)
}
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+static void debugfs_ext_server_init(void)
+{
+ struct dentry *d_ext;
+ unsigned long cpu;
+
+ d_ext = debugfs_create_dir("ext_server", debugfs_sched);
+ if (!d_ext)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_ext);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
+ }
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -526,6 +628,9 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
+#ifdef CONFIG_SCHED_CLASS_EXT
+ debugfs_ext_server_init();
+#endif
return 0;
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index afe28c04d5aa..c18e81e8ef51 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -958,6 +959,8 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -1501,6 +1504,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ /* Start dl_server if this is the first task being enqueued */
+ if (rq->scx.nr_running == 1)
+ dl_server_start(&rq->ext_server);
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2453,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq_modified_clear(rq);
+ rq->next_class = &ext_sched_class;
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2468,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+ if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2512,6 +2519,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return do_pick_task_scx(rq, rf, false);
}
+/*
+ * Select the next task to run from the ext scheduling class.
+ *
+ * Use do_pick_task_scx() directly with @force_scx enabled, since the
+ * dl_server must always select a sched_ext task.
+ */
+static struct task_struct *
+ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ if (!scx_enabled())
+ return NULL;
+
+ return do_pick_task_scx(dl_se->rq, rf, true);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_pick_task);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -2619,6 +2653,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
set_cpus_allowed_common(p, ac);
+ if (task_dead_and_done(p))
+ return;
+
/*
* The effective cpumask is stored in @p->cpus_ptr which may temporarily
* differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3034,10 +3071,45 @@ void scx_cancel_fork(struct task_struct *p)
percpu_up_read(&scx_fork_rwsem);
}
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+ * disabled and then calls __schedule(). If @p has %TASK_DEAD set and
+ * is off CPU, @p won't ever run again.
+ */
+ return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+ !task_on_cpu(rq, p);
+}
+
void sched_ext_dead(struct task_struct *p)
{
unsigned long flags;
+ /*
+ * By the time control reaches here, @p has %TASK_DEAD set, switched out
+ * for the last time and then dropped the rq lock - task_dead_and_done()
+ * should be returning %true nullifying the straggling sched_class ops.
+ * Remove from scx_tasks and exit @p.
+ */
raw_spin_lock_irqsave(&scx_tasks_lock, flags);
list_del_init(&p->scx.tasks_node);
raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3063,6 +3135,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
lockdep_assert_rq_held(task_rq(p));
+ if (task_dead_and_done(p))
+ return;
+
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
if (SCX_HAS_OP(sch, set_weight))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3077,6 +3152,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
struct scx_sched *sch = scx_root;
+ if (task_dead_and_done(p))
+ return;
+
scx_enable_task(p);
/*
@@ -3090,10 +3168,14 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
+ if (task_dead_and_done(p))
+ return;
+
scx_disable_task(p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3354,8 +3436,6 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
- .queue_mask = 1,
-
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
@@ -3440,8 +3520,8 @@ static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
* operations inside scheduler locks.
*/
dsq->id = SCX_DSQ_INVALID;
- llist_add(&dsq->free_node, &dsqs_to_free);
- irq_work_queue(&free_dsq_irq_work);
+ if (llist_add(&dsq->free_node, &dsqs_to_free))
+ irq_work_queue(&free_dsq_irq_work);
out_unlock_dsq:
raw_spin_unlock_irqrestore(&dsq->lock, flags);
@@ -7227,9 +7307,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
@@ -7248,7 +7328,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
-BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_events)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..1e22b7fadd70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
+extern void __BUILD_BUG_vruntime_cmp(void);
+
+/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */
+
+#define vruntime_cmp(A, CMP_STR, B) ({ \
+ int __res = 0; \
+ \
+ if (!__builtin_strcmp(CMP_STR, "<")) { \
+ __res = ((s64)((A)-(B)) < 0); \
+ } else if (!__builtin_strcmp(CMP_STR, "<=")) { \
+ __res = ((s64)((A)-(B)) <= 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">")) { \
+ __res = ((s64)((A)-(B)) > 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">=")) { \
+ __res = ((s64)((A)-(B)) >= 0); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_cmp(); \
+ } \
+ \
+ __res; \
+})
+
+extern void __BUILD_BUG_vruntime_op(void);
+
+#define vruntime_op(A, OP_STR, B) ({ \
+ s64 __res = 0; \
+ \
+ if (!__builtin_strcmp(OP_STR, "-")) { \
+ __res = (s64)((A)-(B)); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_op(); \
+ } \
+ \
+ __res; \
+})
+
+
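
The point of funnelling every comparison through a signed delta is that u64 vruntime values may wrap; a plain relational operator would then order them incorrectly. A small standalone illustration of the same trick (a userspace rendition of what the macro expands to, not the macro itself):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace rendition of what vruntime_cmp(A, "<", B) expands to. */
static int vruntime_lt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_max = UINT64_MAX - 10;	/* just before the counter wraps */
	uint64_t wrapped  = 5;			/* shortly after the wrap */

	assert(!(near_max < wrapped));		/* naive compare orders these wrongly */
	assert(vruntime_lt(near_max, wrapped));	/* signed delta keeps the intended order */

	printf("wraparound-safe comparison OK\n");
	return 0;
}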
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
+ if (vruntime_cmp(vruntime, ">", max_vruntime))
max_vruntime = vruntime;
return max_vruntime;
@@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
+ if (vruntime_cmp(vruntime, "<", min_vruntime))
min_vruntime = vruntime;
return min_vruntime;
@@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a,
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
- return (s64)(a->deadline - b->deadline) < 0;
+ return vruntime_cmp(a->deadline, "<", b->deadline);
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+ return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -576,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* \Sum lag_i = 0
* \Sum w_i * (V - v_i) = 0
- * \Sum w_i * V - w_i * v_i = 0
+ * \Sum (w_i * V - w_i * v_i) = 0
*
* From which we can solve an expression for V in v_i (which we have in
* se->vruntime):
@@ -607,11 +644,11 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Which we track using:
*
* v0 := cfs_rq->zero_vruntime
- * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
- * \Sum w_i := cfs_rq->avg_load
+ * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
+ * \Sum w_i := cfs_rq->sum_weight
*
* Since zero_vruntime closely tracks the per-task service, these
- * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
@@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
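
A toy, standalone rendition of this bookkeeping with made-up weights and vruntimes, computing V exactly the way avg_vruntime() reconstructs it from sum_w_vruntime and sum_weight:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t zero_vruntime = 1000;			/* v0 */
	uint64_t v[] = { 1010, 1040, 990 };		/* v_i */
	long     w[] = { 1024, 2048, 1024 };		/* w_i */
	int64_t  sum_w_vruntime = 0;
	long     sum_weight = 0;

	for (int i = 0; i < 3; i++) {
		/* entity_key() * weight, accumulated as in sum_w_vruntime_add() */
		sum_w_vruntime += (int64_t)(v[i] - zero_vruntime) * w[i];
		sum_weight     += w[i];
	}

	/* avg_vruntime(): V = v0 + \Sum (v_i - v0) * w_i / \Sum w_i */
	uint64_t V = zero_vruntime + sum_w_vruntime / sum_weight;

	printf("weighted average vruntime V = %llu\n", (unsigned long long)V);
	return 0;
}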
static void
-avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime += key * weight;
- cfs_rq->avg_load += weight;
+ cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_weight += weight;
}
static void
-avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime -= key * weight;
- cfs_rq->avg_load -= weight;
+ cfs_rq->sum_w_vruntime -= key * weight;
+ cfs_rq->sum_weight -= weight;
}
static inline
-void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
/*
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
*/
- cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
}
/*
@@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+ return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+ s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
- avg_vruntime_update(cfs_rq, delta);
+ sum_w_vruntime_update(cfs_rq, delta);
cfs_rq->zero_vruntime = vruntime;
}
@@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-
static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
+
+ if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
se->min_vruntime = rse->min_vruntime;
}
}
@@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- avg_vruntime_add(cfs_rq, se);
+ sum_w_vruntime_add(cfs_rq, se);
update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
@@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
- avg_vruntime_sub(cfs_rq, se);
+ sum_w_vruntime_sub(cfs_rq, se);
update_zero_vruntime(cfs_rq);
}
@@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti
static inline bool protect_slice(struct sched_entity *se)
{
- return ((s64)(se->vprot - se->vruntime) > 0);
+ return vruntime_cmp(se->vruntime, "<", se->vprot);
}
static inline void cancel_protect_slice(struct sched_entity *se)
@@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
*/
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if ((s64)(se->vruntime - se->deadline) < 0)
+ if (vruntime_cmp(se->vruntime, "<", se->deadline))
return false;
/*
@@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p)
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p)
pid_t gid = 0;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
@@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env,
return false;
rcu_read_lock();
- cur = rcu_dereference(dst_rq->curr);
+ cur = rcu_dereference_all(dst_rq->curr);
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
@@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- cur_ng = rcu_dereference(cur->numa_group);
+ cur_ng = rcu_dereference_all(cur->numa_group);
if (cur_ng == p_ng) {
/*
* Do not swap within a group or between tasks that have
@@ -2458,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
- for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
- /* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
- continue;
-
+ /* Skip CPUs if the source task cannot migrate */
+ for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) {
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
@@ -2499,7 +2532,7 @@ static int task_numa_migrate(struct task_struct *p)
* to satisfy here.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
@@ -2860,6 +2893,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
}
static void task_numa_placement(struct task_struct *p)
+ __context_unsafe(/* conditional locking */)
{
int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
@@ -3022,7 +3056,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
- grp = rcu_dereference(tsk->numa_group);
+ grp = rcu_dereference_all(tsk->numa_group);
if (!grp)
goto no_join;
@@ -3693,7 +3727,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
+ __signed_scalar_typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
@@ -3705,23 +3739,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
@@ -3732,21 +3749,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
+
+/*
+ * Because of rounding, se->util_sum might end up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with this rounding problem between two updates of
+ * util_avg (~1ms) can make cfs->util_sum become zero while
+ * cfs->util_avg is not.
+ *
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider.
+ */
+#define __update_sa(sa, name, delta_avg, delta_sum) do { \
+ add_positive(&(sa)->name##_avg, delta_avg); \
+ add_positive(&(sa)->name##_sum, delta_sum); \
+ (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \
+ (sa)->name##_sum, \
+ (sa)->name##_avg * PELT_MIN_DIVIDER); \
+} while (0)
+
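
A toy userspace rendition of the clamp __update_sa() applies: after the deltas land, the _sum is pulled back up to at least _avg times the minimum divider, absorbing the rounding drift described above. The PELT_MIN_DIVIDER value below is illustrative, not the kernel's, and the underflow clamp of add_positive() is omitted:

#include <stdio.h>

#define PELT_MIN_DIVIDER 46718UL	/* illustrative value */

struct toy_avg { unsigned long util_avg; unsigned long util_sum; };

static void toy_update(struct toy_avg *sa, long d_avg, long d_sum)
{
	sa->util_avg += d_avg;
	sa->util_sum += d_sum;
	/* never let the sum drop below avg * minimum divider */
	if (sa->util_sum < sa->util_avg * PELT_MIN_DIVIDER)
		sa->util_sum = sa->util_avg * PELT_MIN_DIVIDER;
}

int main(void)
{
	struct toy_avg sa = { .util_avg = 10, .util_sum = 10 * PELT_MIN_DIVIDER + 5 };

	/* Detach an entity whose sum was rounded a little too high. */
	toy_update(&sa, -2, -(long)(2 * PELT_MIN_DIVIDER + 100));

	printf("util_avg=%lu util_sum=%lu (>= %lu)\n",
	       sa.util_avg, sa.util_sum, sa.util_avg * PELT_MIN_DIVIDER);
	return 0;
}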
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ __update_sa(&cfs_rq->avg, load, se->avg.load_avg,
+ se_weight(se) * se->avg.load_sum);
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, -se->avg.load_avg,
+ se_weight(se) * -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
new_sum = se->avg.util_avg * divider;
@@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta_avg);
- add_positive(&cfs_rq->avg.util_sum, delta_sum);
-
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
}
static inline void
@@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
}
static inline void
@@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
rq = rq_of(cfs_rq);
rcu_read_lock();
- is_idle = is_idle_task(rcu_dereference(rq->curr));
+ is_idle = is_idle_task(rcu_dereference_all(rq->curr));
rcu_read_unlock();
/*
@@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
- /* See sa->util_sum below */
- sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, load, -r, -r*divider);
r = removed_util;
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
- /*
- * Because of rounding, se->util_sum might ends up being +1 more than
- * cfs->util_sum. Although this is not a problem by itself, detaching
- * a lot of tasks with the rounding problem between 2 updates of
- * util_avg (~1ms) can make cfs->util_sum becoming null whereas
- * cfs_util_avg is not.
- * Check that util_sum is still above its lower bound for the new
- * util_avg. Given that period_contrib might have moved since the last
- * sync, we are only sure that util_sum must be above or equal to
- * util_avg * minimum possible divider
- */
- sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, util, -r, -r*divider);
r = removed_runnable;
- sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
- /* See sa->util_sum above */
- sa->runnable_sum = max_t(u32, sa->runnable_sum,
- sa->runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, runnable, -r, -r*divider);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
-
- sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
+ __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -4781,7 +4773,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
+ __must_hold(__rq_lockp(this_rq));
static inline unsigned long task_util(struct task_struct *p)
{
@@ -5175,7 +5168,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* vl_i = (W + w_i)*vl'_i / W
*/
- load = cfs_rq->avg_load;
+ load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
@@ -6188,6 +6181,7 @@ next:
* used to track this state.
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
+ __must_hold(&cfs_b->lock)
{
int throttled;
@@ -7147,8 +7141,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
static struct {
cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- int has_blocked; /* Idle CPUS has blocked load */
+ int has_blocked_load; /* Idle CPUs have blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
@@ -7506,7 +7499,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7515,7 +7508,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7644,7 +7637,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
if (sd_share) {
/* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
@@ -7850,7 +7843,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -7865,7 +7858,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = rcu_dereference_all(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -8334,7 +8327,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct energy_env eenv;
rcu_read_lock();
- pd = rcu_dereference(rd->pd);
+ pd = rcu_dereference_all(rd->pd);
if (!pd)
goto unlock;
@@ -8342,7 +8335,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
@@ -8365,9 +8358,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
int fits, max_fits = -1;
- cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
-
- if (cpumask_empty(cpus))
+ if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
continue;
/* Account external pressure for the energy estimation */
@@ -8744,7 +8735,7 @@ preempt_sync(struct rq *rq, int wake_flags,
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
@@ -8752,6 +8743,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ /*
+ * XXX Getting preempted by higher class, try and find idle CPU?
+ */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
if (unlikely(se == pse))
return;
@@ -8828,16 +8825,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if ((wake_flags & WF_FORK) || pse->sched_delayed)
return;
- /*
- * If @p potentially is completing work required by current then
- * consider preemption.
- *
- * Reschedule if waker is no longer eligible. */
- if (in_task() && !entity_eligible(cfs_rq, se)) {
- preempt_action = PREEMPT_WAKEUP_RESCHED;
- goto preempt;
- }
-
/* Prefer picking wakee soon if appropriate. */
if (sched_feat(NEXT_BUDDY) &&
set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
@@ -8919,6 +8906,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
struct sched_entity *se;
struct task_struct *p;
@@ -8995,12 +8983,6 @@ idle:
goto again;
}
- /*
- * rq is about to be idle, check if we need to update the
- * lost_idle_time of clock_pelt
- */
- update_idle_rq_clock_pelt(rq);
-
return NULL;
}
@@ -9349,7 +9331,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
*/
static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
@@ -9778,7 +9760,7 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_NO_HZ_COMMON
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
@@ -9811,16 +9793,16 @@ static inline void update_blocked_load_tick(struct rq *rq)
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
- if (!has_blocked)
+ if (!has_blocked_load)
rq->has_blocked_load = 0;
}
#else /* !CONFIG_NO_HZ_COMMON: */
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {}
#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
@@ -9877,7 +9859,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
}
@@ -9937,7 +9919,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
return decayed;
@@ -9949,23 +9931,27 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
- update_blocked_load_status(rq, !done);
+ update_has_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
/********** Helpers for sched_balance_find_src_group ************************/
@@ -10975,10 +10961,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
* take care of it.
*/
if (p->nr_cpus_allowed != NR_CPUS) {
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
-
- cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
- imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ unsigned int w = cpumask_weight_and(p->cpus_ptr,
+ sched_group_span(local));
+ imb_numa_nr = min(w, sd->imb_numa_nr);
}
imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
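The cpumask_weight_and() conversion above avoids building a temporary mask just to count its bits. A rough userspace equivalent for a single 64-bit mask word (the function name is made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* popcount of the intersection, no scratch mask required */
static unsigned int weight_and(uint64_t a, uint64_t b)
{
	return (unsigned int)__builtin_popcountll(a & b);
}

int main(void)
{
	uint64_t allowed = 0x00ffULL;	/* p->cpus_ptr analogue */
	uint64_t local   = 0x0f0fULL;	/* sched_group_span(local) analogue */

	printf("overlapping CPUs: %u\n", weight_and(allowed, local));
	return 0;
}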
@@ -11025,7 +11010,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (env->sd->span_weight != llc_weight)
return;
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
if (!sd_share)
return;
@@ -11375,7 +11360,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
- rcu_dereference(env->dst_rq->rd->pd))
+ rcu_dereference_all(env->dst_rq->rd->pd))
goto out_balanced;
/* ASYM feature bypasses nice load balance check */
@@ -12443,20 +12428,29 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nohz_balance_exit_idle(rq);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing:
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return;
-
- if (READ_ONCE(nohz.has_blocked) &&
+ if (READ_ONCE(nohz.has_blocked_load) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
+ /*
+ * Most of the time the system is not 100% busy, i.e. some CPUs are
+ * in tickless mode. Skip the idle_cpus_mask check when the next
+ * balance is not yet due.
+ *
+ * If no CPU is in tickless mode, there may be a narrow window
+ * (28 jiffies with HZ=1000) where flags are set and kick_ilb() is
+ * called, but idle load balancing is not done because find_new_ilb()
+ * fails. That is very rare, so only check idle_cpus_mask when the
+ * time is due.
+ */
if (time_before(now, nohz.next_balance))
goto out;
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing
+ */
+ if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+ return;
+
if (rq->nr_running >= 2) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
@@ -12464,7 +12458,7 @@ static void nohz_balancer_kick(struct rq *rq)
rcu_read_lock();
- sd = rcu_dereference(rq->sd);
+ sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
* If there's a runnable CFS task and the current CPU has reduced
@@ -12476,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
@@ -12494,7 +12488,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
@@ -12515,7 +12509,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto unlock;
}
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12547,7 +12541,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
@@ -12567,7 +12561,6 @@ void nohz_balance_exit_idle(struct rq *rq)
rq->nohz_tick_stopped = 0;
cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
@@ -12577,7 +12570,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
@@ -12611,9 +12604,9 @@ void nohz_balance_enter_idle(int cpu)
/*
* The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * meantime. We set the nohz.has_blocked_load flag to trigger a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
+ * of nohz.has_blocked_load can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
@@ -12625,11 +12618,10 @@ void nohz_balance_enter_idle(int cpu)
rq->nohz_tick_stopped = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
/*
* Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
+ * @idle_cpus_mask store, it must observe the @has_blocked_load
* and @needs_update stores.
*/
smp_mb__after_atomic();
@@ -12642,7 +12634,7 @@ out:
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
static bool update_nohz_stats(struct rq *rq)
@@ -12683,8 +12675,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/*
* We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trigger another update of idle load.
+ * the has_blocked_load flag. If a cpu enters idle in the meantime, it will
+ * set the has_blocked_load flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
@@ -12692,12 +12684,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(nohz.has_blocked_load, 0);
if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/*
- * Ensures that if we miss the CPU, we must see the has_blocked
+ * Ensures that if we miss the CPU, we must see the has_blocked_load
* store from nohz_balance_enter_idle().
*/
smp_mb();
@@ -12764,7 +12756,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
/*
@@ -12826,7 +12818,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
+ if (!READ_ONCE(nohz.has_blocked_load) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
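The idle_cpus_mask/has_blocked_load pairing documented in the hunks above can be sketched in userspace with C11 atomics. This is only an illustration of the ordering argument, not kernel code, and the identifiers are stand-ins:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int idle_mask;	/* stands in for nohz.idle_cpus_mask */
static atomic_int has_blocked;	/* stands in for nohz.has_blocked_load */

/* CPU going idle: publish itself in the mask, then set the load flag. */
static void *enter_idle(void *arg)
{
	(void)arg;
	atomic_store(&idle_mask, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	atomic_store(&has_blocked, 1);
	return NULL;
}

/* Balancer: clear the flag, full barrier, then scan the mask. */
static void *idle_balance(void *arg)
{
	int saw_idle;

	(void)arg;
	atomic_store(&has_blocked, 0);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	saw_idle = atomic_load(&idle_mask);

	/*
	 * Either the balancer saw the idle CPU and refreshes its blocked
	 * load, or it missed it, in which case the idler's has_blocked
	 * store cannot have been overwritten by the clear above, so the
	 * blocked load is picked up on a later pass.
	 */
	printf("balancer saw idle CPU: %d, flag now: %d\n",
	       saw_idle, atomic_load(&has_blocked));
	return NULL;
}

int main(void)
{
	pthread_t idler, balancer;

	pthread_create(&idler, NULL, enter_idle, NULL);
	pthread_create(&balancer, NULL, idle_balance, NULL);
	pthread_join(idler, NULL);
	pthread_join(balancer, NULL);
	return 0;
}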
@@ -12858,6 +12850,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* > 0 - success, new (fair) tasks present
*/
static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
+ __must_hold(__rq_lockp(this_rq))
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -12896,29 +12889,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!sd) {
- rcu_read_unlock();
+ sd = rcu_dereference_sched_domain(this_rq->sd);
+ if (!sd)
goto out;
- }
if (!get_rd_overloaded(this_rq->rd) ||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
- rcu_read_unlock();
goto out;
}
- rcu_read_unlock();
-
- rq_modified_clear(this_rq);
- raw_spin_rq_unlock(this_rq);
+ /*
+ * Include sched_balance_update_blocked_averages() in the cost
+ * calculation because it can be quite costly -- this ensures we skip
+ * it when avg_idle gets to be very low.
+ */
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
+ __sched_balance_update_blocked_averages(this_rq);
+
+ this_rq->next_class = &fair_sched_class;
+ raw_spin_rq_unlock(this_rq);
- rcu_read_lock();
for_each_domain(this_cpu, sd) {
u64 domain_cost;
@@ -12968,7 +12960,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task || !continue_balancing)
break;
}
- rcu_read_unlock();
raw_spin_rq_lock(this_rq);
@@ -12984,7 +12975,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
+ if (sched_class_above(this_rq->next_class, &fair_sched_class))
pulled_task = -1;
out:
@@ -13335,8 +13326,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
* zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+ delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
+ vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13374,6 +13365,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (queued) {
+ if (!need_resched())
+ hrtick_start_fair(rq, curr);
+ return;
+ }
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13882,15 +13879,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
-
- .queue_mask = 2,
-
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .wakeup_preempt = check_preempt_wakeup_fair,
+ .wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
@@ -13950,7 +13944,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
struct numa_group *ng;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 980d92bab8ab..136a6584be79 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true)
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
*/
-SCHED_FEAT(NEXT_BUDDY, true)
+SCHED_FEAT(NEXT_BUDDY, false)
/*
* Allow completely ignoring cfs_rq->next; which can be set from various
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe1dd17..3681b6ad9276 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
{
update_curr_idle(rq);
scx_update_idle(rq, false, true);
+ update_rq_avg_idle(rq);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -468,6 +469,12 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
+
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
}
struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
@@ -530,15 +537,15 @@ static void update_curr_idle(struct rq *rq)
se->exec_start = now;
dl_server_update_idle(&rq->fair_server, delta_exec);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_update_idle(&rq->ext_server, delta_exec);
+#endif
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
DEFINE_SCHED_CLASS(idle) = {
-
- .queue_mask = 0,
-
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3ad0d6df6a0a..3b725d39c06e 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,9 +8,11 @@
*
*/
#include <linux/sched/isolation.h>
+#include <linux/pci.h>
#include "sched.h"
enum hk_flags {
+ HK_FLAG_DOMAIN_BOOT = BIT(HK_TYPE_DOMAIN_BOOT),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
@@ -20,7 +22,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
struct housekeeping {
- cpumask_var_t cpumasks[HK_TYPE_MAX];
+ struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
unsigned long flags;
};
@@ -28,21 +30,62 @@ static struct housekeeping housekeeping;
bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping.flags & BIT(type));
+ return !!(READ_ONCE(housekeeping.flags) & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
+static bool housekeeping_dereference_check(enum hk_type type)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) {
+ /* Cpuset isn't even writable yet? */
+ if (system_state <= SYSTEM_SCHEDULING)
+ return true;
+
+ /* CPU hotplug write locked, so cpuset partition can't be overwritten */
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held())
+ return true;
+
+ /* Cpuset lock held, partitions not writable */
+ if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held())
+ return true;
+
+ return false;
+ }
+
+ return true;
+}
+
+static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
+{
+ return rcu_dereference_all_check(housekeeping.cpumasks[type],
+ housekeeping_dereference_check(type));
+}
+
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
+{
+ const struct cpumask *mask = NULL;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (READ_ONCE(housekeeping.flags) & BIT(type))
+ mask = housekeeping_cpumask_dereference(type);
+ }
+ if (!mask)
+ mask = cpu_possible_mask;
+ return mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
if (housekeeping.flags & BIT(type)) {
- cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
+ cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
if (likely(cpu < nr_cpu_ids))
return cpu;
/*
@@ -58,32 +101,69 @@ int housekeeping_any_cpu(enum hk_type type)
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_type type)
-{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping.flags & BIT(type))
- return housekeeping.cpumasks[type];
- return cpu_possible_mask;
-}
-EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-
void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping.flags & BIT(type))
- set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
+ set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping.flags & BIT(type))
- return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
+ if (static_branch_unlikely(&housekeeping_overridden) &&
+ READ_ONCE(housekeeping.flags) & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+int housekeeping_update(struct cpumask *isol_mask)
+{
+ struct cpumask *trial, *old = NULL;
+ int err;
+
+ lockdep_assert_cpus_held();
+
+ trial = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!trial)
+ return -ENOMEM;
+
+ cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask);
+ if (!cpumask_intersects(trial, cpu_online_mask)) {
+ kfree(trial);
+ return -EINVAL;
+ }
+
+ if (!housekeeping.flags)
+ static_branch_enable_cpuslocked(&housekeeping_overridden);
+
+ if (housekeeping.flags & HK_FLAG_DOMAIN)
+ old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
+ else
+ WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN);
+ rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial);
+
+ synchronize_rcu();
+
+ pci_probe_flush_workqueue();
+ mem_cgroup_flush_workqueue();
+ vmstat_flush_workqueue();
+
+ err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN));
+ WARN_ON_ONCE(err < 0);
+
+ err = tmigr_isolated_exclude_cpumask(isol_mask);
+ WARN_ON_ONCE(err < 0);
+
+ err = kthreads_update_housekeeping();
+ WARN_ON_ONCE(err < 0);
+
+ kfree(old);
+
+ return 0;
+}
+
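housekeeping_update() above follows the usual RCU publish pattern: build a new mask, publish the pointer, wait a grace period, then free the old copy. A much-reduced userspace sketch with C11 atomics (the grace period is only indicated by a comment, and all names are illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct cpumask { uint64_t bits; };		/* toy single-word mask */

static _Atomic(struct cpumask *) hk_domain_mask;

static void reader(void)
{
	struct cpumask *m = atomic_load_explicit(&hk_domain_mask,
						 memory_order_acquire);
	if (m)
		printf("housekeeping CPUs: %#llx\n",
		       (unsigned long long)m->bits);
}

static int update_mask(uint64_t isol_mask)
{
	struct cpumask *new, *old;

	new = malloc(sizeof(*new));
	if (!new)
		return -1;

	new->bits = ~isol_mask;			/* boot mask & ~isol_mask, simplified */

	/* rcu_assign_pointer(): publish the fully initialized copy */
	old = atomic_exchange_explicit(&hk_domain_mask, new,
				       memory_order_acq_rel);

	/* Kernel: synchronize_rcu() here, so no reader still sees "old". */
	free(old);
	return 0;
}

int main(void)
{
	update_mask(0x2);
	reader();
	return 0;
}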
void __init housekeeping_init(void)
{
enum hk_type type;
@@ -95,20 +175,33 @@ void __init housekeeping_init(void)
if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
-
+ /*
+ * Reallocate with a proper allocator so that any later cpumask
+ * update can simply free the old version with kfree().
+ */
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
+
+ if (WARN_ON_ONCE(!nmask))
+ return;
+
+ omask = rcu_dereference(housekeeping.cpumasks[type]);
+
/* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ WARN_ON_ONCE(cpumask_empty(omask));
+ cpumask_copy(nmask, omask);
+ RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
+ memblock_free(omask, cpumask_size());
}
}
static void __init housekeeping_setup_type(enum hk_type type,
cpumask_var_t housekeeping_staging)
{
+ struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
- alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
- cpumask_copy(housekeeping.cpumasks[type],
- housekeeping_staging);
+ cpumask_copy(mask, housekeeping_staging);
+ RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
}
static int __init housekeeping_setup(char *str, unsigned long flags)
@@ -161,7 +254,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
if (!cpumask_equal(housekeeping_staging,
- housekeeping.cpumasks[type])) {
+ housekeeping_cpumask(type))) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
goto free_housekeeping_staging;
}
@@ -182,7 +275,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
cpumask_first_and_and(cpu_present_mask,
- housekeeping_staging, housekeeping.cpumasks[type]);
+ housekeeping_staging, housekeeping_cpumask(type));
if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
pr_warn("Housekeeping: must include one present CPU "
"neither in nohz_full= nor in isolcpus=domain, "
@@ -239,7 +332,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
if (!strncmp(str, "domain,", 7)) {
str += 7;
- flags |= HK_FLAG_DOMAIN;
+ flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
continue;
}
@@ -269,7 +362,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
/* Default behaviour for isolcpus without flags */
if (!flags)
- flags |= HK_FLAG_DOMAIN;
+ flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe8e5c5..a7680477fa6f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
+ /*
+ * XXX If we're preempted by DL, queue a push?
+ */
+ if (p->sched_class != &rt_sched_class)
+ return;
+
if (p->prio < donor->prio) {
resched_curr(rq);
return;
@@ -2100,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq)
*/
static int rto_next_cpu(struct root_domain *rd)
{
+ int this_cpu = smp_processor_id();
int next;
int cpu;
@@ -2123,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd)
rd->rto_cpu = cpu;
+ /* Do not send IPI to self */
+ if (cpu == this_cpu)
+ continue;
+
if (cpu < nr_cpu_ids)
return cpu;
@@ -2568,9 +2579,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
-
- .queue_mask = 4,
-
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93fce4bbff5e..b82fb70a9d54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -30,6 +30,7 @@
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/fs_api.h>
@@ -42,6 +43,8 @@
#include <linux/ktime_api.h>
#include <linux/lockdep_api.h>
#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/memcontrol.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -65,6 +68,7 @@
#include <linux/types.h>
#include <linux/u64_stats_sync_api.h>
#include <linux/uaccess.h>
+#include <linux/vmstat.h>
#include <linux/wait_api.h>
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
@@ -82,9 +86,8 @@ struct rt_rq;
struct sched_group;
struct cpuidle_state;
-#ifdef CONFIG_PARAVIRT
+#if defined(CONFIG_PARAVIRT) && !defined(CONFIG_HAVE_PV_STEAL_CLOCK_GEN)
# include <asm/paravirt.h>
-# include <asm/paravirt_api_clock.h>
#endif
#include <asm/barrier.h>
@@ -414,6 +417,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void sched_init_dl_servers(void);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -670,16 +674,16 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
-/* CFS-related fields in a runqueue */
+/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */
struct cfs_rq {
struct load_weight load;
unsigned int nr_queued;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_idle; /* SCHED_IDLE */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
- s64 avg_vruntime;
- u64 avg_load;
+ s64 sum_w_vruntime;
+ u64 sum_weight;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -690,7 +694,7 @@ struct cfs_rq {
struct rb_root_cached tasks_timeline;
/*
- * 'curr' points to currently running entity on this cfs_rq.
+ * 'curr' points to the currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -726,9 +730,7 @@ struct cfs_rq {
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
@@ -741,19 +743,19 @@ struct cfs_rq {
*/
int on_list;
struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ struct task_group *tg; /* Group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
-#ifdef CONFIG_CFS_BANDWIDTH
+# ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
-#ifndef CONFIG_64BIT
+# ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
-#endif
+# endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -765,7 +767,7 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
struct list_head throttled_limbo_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
+# endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
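The avg_vruntime/avg_load fields are renamed above to sum_w_vruntime/sum_weight, which better matches how they are used: the average vruntime is reconstructed on demand relative to zero_vruntime. A hedged userspace sketch of that reconstruction (field and function names are illustrative, rounding details omitted):

#include <stdint.h>
#include <stdio.h>

struct demo_cfs_rq {
	int64_t  sum_w_vruntime;	/* sum over entities: weight * (v_i - zero_vruntime) */
	uint64_t sum_weight;		/* sum over entities: weight */
	uint64_t zero_vruntime;
};

static uint64_t avg_vruntime(const struct demo_cfs_rq *cfs_rq)
{
	if (!cfs_rq->sum_weight)
		return cfs_rq->zero_vruntime;
	return cfs_rq->zero_vruntime +
	       (uint64_t)(cfs_rq->sum_w_vruntime / (int64_t)cfs_rq->sum_weight);
}

int main(void)
{
	/* two entities, weights 1024 and 2048, vruntimes 100 and 400 past zero */
	struct demo_cfs_rq rq = {
		.sum_w_vruntime = 1024 * 100 + 2048 * 400,
		.sum_weight     = 1024 + 2048,
		.zero_vruntime  = 1000,
	};

	printf("avg vruntime = %llu\n", (unsigned long long)avg_vruntime(&rq));
	return 0;
}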
@@ -1117,28 +1119,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
- /* runqueue lock: */
- raw_spinlock_t __lock;
-
- /* Per class runqueue modification mask; bits in class order. */
- unsigned int queue_mask;
+ /*
+ * The following members are loaded together, without holding the
+ * rq->lock, in an extremely hot loop in update_sg_lb_stats()
+ * (called from pick_next_task()). To reduce cache pollution from
+ * this operation, they are placed together on this dedicated cache
+ * line. Even though some of them are frequently modified, they are
+ * loaded much more frequently than they are stored.
+ */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
- unsigned int numa_migrate_on;
#endif
+ unsigned int ttwu_pending;
+ unsigned long cpu_capacity;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
+#endif
+ struct task_struct *idle;
+ /* padding left here deliberately */
+
+ /*
+ * The next cacheline holds the (hot) runqueue lock, as well as
+ * some other less performance-critical fields.
+ */
+ u64 nr_switches ____cacheline_aligned;
+
+ /* runqueue lock: */
+ raw_spinlock_t __lock;
+
#ifdef CONFIG_NO_HZ_COMMON
- unsigned long last_blocked_load_update_tick;
- unsigned int has_blocked_load;
- call_single_data_t nohz_csd;
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
+ unsigned int has_blocked_load;
+ unsigned long last_blocked_load_update_tick;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned int ttwu_pending;
- u64 nr_switches;
-
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
@@ -1151,6 +1175,7 @@ struct rq {
struct dl_rq dl;
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq scx;
+ struct sched_dl_entity ext_server;
#endif
struct sched_dl_entity fair_server;
@@ -1161,6 +1186,9 @@ struct rq {
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int numa_migrate_on;
+#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
@@ -1169,36 +1197,29 @@ struct rq {
*/
unsigned long nr_uninterruptible;
-#ifdef CONFIG_SCHED_PROXY_EXEC
- struct task_struct __rcu *donor; /* Scheduling context */
- struct task_struct __rcu *curr; /* Execution context */
-#else
- union {
- struct task_struct __rcu *donor; /* Scheduler context */
- struct task_struct __rcu *curr; /* Execution context */
- };
-#endif
struct sched_dl_entity *dl_server;
- struct task_struct *idle;
struct task_struct *stop;
+ const struct sched_class *next_class;
unsigned long next_balance;
struct mm_struct *prev_mm;
- unsigned int clock_update_flags;
- u64 clock;
- /* Ensure that all clocks are in the same cache line */
+ /*
+ * The following fields of clock data are frequently referenced
+ * and updated together, and should go on their own cache line.
+ */
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
+ u64 clock;
unsigned long lost_idle_time;
+ unsigned int clock_update_flags;
u64 clock_pelt_idle;
u64 clock_idle;
+
#ifndef CONFIG_64BIT
u64 clock_pelt_idle_copy;
u64 clock_idle_copy;
#endif
- atomic_t nr_iowait;
-
u64 last_seen_need_resched_ns;
int ticks_without_resched;
@@ -1209,8 +1230,6 @@ struct rq {
struct root_domain *rd;
struct sched_domain __rcu *sd;
- unsigned long cpu_capacity;
-
struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
@@ -1320,7 +1339,9 @@ struct rq {
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
-};
+
+ atomic_t nr_iowait;
+} __no_randomize_layout;
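The struct rq reshuffle above groups read-mostly hot fields on one cache line and pushes frequently written ones (and the lock) onto the next. A tiny userspace sketch of the same technique, with alignas() standing in for ____cacheline_aligned (layout and names are illustrative):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct demo_rq {
	/* read-mostly fields, loaded together by the load balancer */
	unsigned int nr_running;
	unsigned int ttwu_pending;
	unsigned long cpu_capacity;

	/* frequently written, pushed onto its own cache line */
	alignas(64) unsigned long long nr_switches;
	int lock;			/* stand-in for raw_spinlock_t */
};

int main(void)
{
	printf("nr_running  @ %zu\n", offsetof(struct demo_rq, nr_running));
	printf("nr_switches @ %zu (starts a new cache line)\n",
	       offsetof(struct demo_rq, nr_switches));
	return 0;
}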
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1358,8 +1379,13 @@ static inline u32 sched_rng(void)
return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
}
+static __always_inline struct rq *__this_rq(void)
+{
+ return this_cpu_ptr(&runqueues);
+}
+
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() this_cpu_ptr(&runqueues)
+#define this_rq() __this_rq()
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
@@ -1426,6 +1452,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
}
static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+ __returns_ctx_lock(rq_lockp(rq)) /* alias them */
{
if (rq->core_enabled)
return &rq->core->__lock;
@@ -1525,6 +1552,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
}
static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+ __returns_ctx_lock(rq_lockp(rq)) /* alias them */
{
return &rq->__lock;
}
@@ -1567,32 +1595,42 @@ static inline bool rt_group_sched_enabled(void)
#endif /* !CONFIG_RT_GROUP_SCHED */
static inline void lockdep_assert_rq_held(struct rq *rq)
+ __assumes_ctx_lock(__rq_lockp(rq))
{
lockdep_assert_held(__rq_lockp(rq));
}
-extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
-extern bool raw_spin_rq_trylock(struct rq *rq);
-extern void raw_spin_rq_unlock(struct rq *rq);
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+ __acquires(__rq_lockp(rq));
+
+extern bool raw_spin_rq_trylock(struct rq *rq)
+ __cond_acquires(true, __rq_lockp(rq));
+
+extern void raw_spin_rq_unlock(struct rq *rq)
+ __releases(__rq_lockp(rq));
static inline void raw_spin_rq_lock(struct rq *rq)
+ __acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
static inline void raw_spin_rq_lock_irq(struct rq *rq)
+ __acquires(__rq_lockp(rq))
{
local_irq_disable();
raw_spin_rq_lock(rq);
}
static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+ __releases(__rq_lockp(rq))
{
raw_spin_rq_unlock(rq);
local_irq_enable();
}
static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+ __acquires(__rq_lockp(rq))
{
unsigned long flags;
@@ -1603,6 +1641,7 @@ static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
}
static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+ __releases(__rq_lockp(rq))
{
raw_spin_rq_unlock(rq);
local_irq_restore(flags);
@@ -1676,6 +1715,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
+extern void update_rq_avg_idle(struct rq *rq);
extern void update_rq_clock(struct rq *rq);
/*
@@ -1851,18 +1891,16 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags |= rf->clock_update_flags;
}
-extern
-struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock);
+#define __task_rq_lock(...) __acquire_ret(___task_rq_lock(__VA_ARGS__), __rq_lockp(__ret))
+extern struct rq *___task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires_ret;
-extern
-struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock);
+#define task_rq_lock(...) __acquire_ret(_task_rq_lock(__VA_ARGS__), __rq_lockp(__ret))
+extern struct rq *_task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(&p->pi_lock) __acquires_ret;
static inline void
__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- __releases(rq->lock)
+ __releases(__rq_lockp(rq))
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock(rq);
@@ -1870,8 +1908,7 @@ __task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- __releases(rq->lock)
- __releases(p->pi_lock)
+ __releases(__rq_lockp(rq), &p->pi_lock)
{
__task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
@@ -1881,6 +1918,8 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
_T->rq = task_rq_lock(_T->lock, &_T->rf),
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DECLARE_LOCK_GUARD_1_ATTRS(task_rq_lock, __acquires(_T->pi_lock), __releases((*(struct task_struct **)_T)->pi_lock))
+#define class_task_rq_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(task_rq_lock, _T)
DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
_T->rq = __task_rq_lock(_T->lock, &_T->rf),
@@ -1888,42 +1927,42 @@ DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
struct rq *rq; struct rq_flags rf)
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
+ __acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
+ __acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
+ __acquires(__rq_lockp(rq))
{
raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
+ __releases(__rq_lockp(rq))
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
+ __releases(__rq_lockp(rq))
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irq(rq);
}
static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
+ __releases(__rq_lockp(rq))
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock(rq);
@@ -1934,18 +1973,27 @@ DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
rq_unlock(_T->lock, &_T->rf),
struct rq_flags rf)
+DECLARE_LOCK_GUARD_1_ATTRS(rq_lock, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
+#define class_rq_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock, _T)
+
DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
rq_lock_irq(_T->lock, &_T->rf),
rq_unlock_irq(_T->lock, &_T->rf),
struct rq_flags rf)
+DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irq, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
+#define class_rq_lock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irq, _T)
+
DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
rq_lock_irqsave(_T->lock, &_T->rf),
rq_unlock_irqrestore(_T->lock, &_T->rf),
struct rq_flags rf)
-static inline struct rq *this_rq_lock_irq(struct rq_flags *rf)
- __acquires(rq->lock)
+DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T)));
+#define class_rq_lock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, _T)
+
+#define this_rq_lock_irq(...) __acquire_ret(_this_rq_lock_irq(__VA_ARGS__), __rq_lockp(__ret))
+static inline struct rq *_this_rq_lock_irq(struct rq_flags *rf) __acquires_ret
{
struct rq *rq;
@@ -2032,8 +2080,8 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_sched_domain(p) \
+ rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -2043,7 +2091,7 @@ queue_balance_callback(struct rq *rq,
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
@@ -2451,15 +2499,6 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
- /*
- * idle: 0
- * ext: 1
- * fair: 2
- * rt: 4
- * dl: 8
- * stop: 16
- */
- unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2618,20 +2657,6 @@ struct sched_class {
#endif
};
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
- rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
- unsigned int mask = class->queue_mask;
- return rq->queue_mask & ~((mask << 1) - 1);
-}
-
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -3073,8 +3098,20 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+ __no_context_analysis \
{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
+#define DECLARE_LOCK_GUARD_2_ATTRS(_name, _lock, _unlock1, _unlock2) \
+static inline class_##_name##_t class_##_name##_constructor(lock_##_name##_t *_T1, \
+ lock_##_name##_t *_T2) _lock; \
+static __always_inline void __class_##_name##_cleanup_ctx1(class_##_name##_t **_T1) \
+ __no_context_analysis _unlock1 { } \
+static __always_inline void __class_##_name##_cleanup_ctx2(class_##_name##_t **_T2) \
+ __no_context_analysis _unlock2 { }
+#define WITH_LOCK_GUARD_2_ATTRS(_name, _T1, _T2) \
+ class_##_name##_constructor(_T1, _T2), \
+ *__UNIQUE_ID(unlock1) __cleanup(__class_##_name##_cleanup_ctx1) = (void *)(_T1),\
+ *__UNIQUE_ID(unlock2) __cleanup(__class_##_name##_cleanup_ctx2) = (void *)(_T2)
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
@@ -3102,7 +3139,8 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
return rq1->cpu < rq2->cpu;
}
-extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(__rq_lockp(rq1), __rq_lockp(rq2));
#ifdef CONFIG_PREEMPTION
@@ -3115,9 +3153,8 @@ extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
* also adds more overhead and therefore may reduce throughput.
*/
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
+ __must_hold(__rq_lockp(this_rq))
+ __acquires(__rq_lockp(busiest))
{
raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
@@ -3134,12 +3171,16 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
* regardless of entry order into the function.
*/
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
+ __must_hold(__rq_lockp(this_rq))
+ __acquires(__rq_lockp(busiest))
{
- if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
- likely(raw_spin_rq_trylock(busiest))) {
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest)) {
+ __acquire(__rq_lockp(busiest)); /* already held */
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
+ }
+
+ if (likely(raw_spin_rq_trylock(busiest))) {
double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
@@ -3162,6 +3203,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __must_hold(__rq_lockp(this_rq))
+ __acquires(__rq_lockp(busiest))
{
lockdep_assert_irqs_disabled();
@@ -3169,14 +3212,17 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
}
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(busiest->lock)
+ __releases(__rq_lockp(busiest))
{
if (__rq_lockp(this_rq) != __rq_lockp(busiest))
raw_spin_rq_unlock(busiest);
+ else
+ __release(__rq_lockp(busiest)); /* fake release */
lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+ __acquires(l1, l2)
{
if (l1 > l2)
swap(l1, l2);
@@ -3186,6 +3232,7 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
}
static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+ __acquires(l1, l2)
{
if (l1 > l2)
swap(l1, l2);
@@ -3195,6 +3242,7 @@ static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
}
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+ __acquires(l1, l2)
{
if (l1 > l2)
swap(l1, l2);
@@ -3204,6 +3252,7 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
}
static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+ __releases(l1, l2)
{
raw_spin_unlock(l1);
raw_spin_unlock(l2);
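double_lock(), double_lock_irq() and double_raw_lock() above all take the two locks in address order so that concurrent callers locking the same pair can never deadlock ABBA-style. A userspace sketch of the rule with pthread mutexes (comparing unrelated pointers is formally undefined in C, but mirrors what the kernel helpers do):

#include <pthread.h>
#include <stdio.h>

static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if (l1 > l2) {			/* order by address, like the kernel helper */
		pthread_mutex_t *tmp = l1;

		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	double_lock(&a, &b);		/* same order as double_lock(&b, &a) would use */
	puts("both locks held in a stable global order");
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);
	return 0;
}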
@@ -3213,6 +3262,13 @@ DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
double_raw_lock(_T->lock, _T->lock2),
double_raw_unlock(_T->lock, _T->lock2))
+DECLARE_LOCK_GUARD_2_ATTRS(double_raw_spinlock,
+ __acquires(_T1, _T2),
+ __releases(*(raw_spinlock_t **)_T1),
+ __releases(*(raw_spinlock_t **)_T2));
+#define class_double_raw_spinlock_constructor(_T1, _T2) \
+ WITH_LOCK_GUARD_2_ATTRS(double_raw_spinlock, _T1, _T2)
+
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -3220,13 +3276,12 @@ DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
* you need to do so manually after calling.
*/
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
+ __releases(__rq_lockp(rq1), __rq_lockp(rq2))
{
if (__rq_lockp(rq1) != __rq_lockp(rq2))
raw_spin_rq_unlock(rq2);
else
- __release(rq2->lock);
+ __release(__rq_lockp(rq2)); /* fake release */
raw_spin_rq_unlock(rq1);
}
@@ -3337,11 +3392,11 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
-extern int sched_clock_irqtime;
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
static inline int irqtime_enabled(void)
{
- return sched_clock_irqtime;
+ return static_branch_likely(&sched_clock_irqtime);
}
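The sched_clock_irqtime conversion above turns a read-mostly integer into a static key, so the enabled check compiles down to a patched branch. As a very loose userspace stand-in, a plain boolean with a branch hint (this does not capture the runtime code patching a real static key performs):

#include <stdbool.h>
#include <stdio.h>

static bool sched_clock_irqtime;	/* flipped once at boot in the kernel */

static inline int irqtime_enabled(void)
{
	/* static_branch_likely() analogue: hint that the common case is "on" */
	return __builtin_expect(sched_clock_irqtime, 1);
}

int main(void)
{
	sched_clock_irqtime = true;
	if (irqtime_enabled())
		puts("accounting IRQ time");
	return 0;
}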
/*
@@ -3758,8 +3813,10 @@ static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
{
/* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
- pcp->cid = cpu_cid_to_cid(pcp->cid);
- mm_drop_cid(mm, pcp->cid);
+ if (cid_on_cpu(pcp->cid)) {
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+ }
}
static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
@@ -3816,7 +3873,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3841,12 +3899,17 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
/* Still nothing, allocate a new one */
if (!cid_on_cpu(cpu_cid))
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+
+ /* Handle the transition mode flag if required */
+ if (mode & MM_CID_TRANSIT)
+ cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
}
-static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3872,7 +3935,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
if (!cid_on_task(tcid))
tcid = mm_get_cid(mm);
/* Set the transition mode flag if required */
- tcid |= READ_ONCE(mm->mm_cid.transit);
+ tcid |= mode & MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, tcid);
mm_cid_update_task_cid(t, tcid);
@@ -3881,26 +3944,46 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
static __always_inline void mm_cid_schedin(struct task_struct *next)
{
struct mm_struct *mm = next->mm;
- unsigned int cpu_cid;
+ unsigned int cpu_cid, mode;
if (!next->mm_cid.active)
return;
cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
- if (likely(!READ_ONCE(mm->mm_cid.percpu)))
- mm_cid_from_task(next, cpu_cid);
+ mode = READ_ONCE(mm->mm_cid.mode);
+ if (likely(!cid_on_cpu(mode)))
+ mm_cid_from_task(next, cpu_cid, mode);
else
- mm_cid_from_cpu(next, cpu_cid);
+ mm_cid_from_cpu(next, cpu_cid, mode);
}
static __always_inline void mm_cid_schedout(struct task_struct *prev)
{
+ struct mm_struct *mm = prev->mm;
+ unsigned int mode, cid;
+
/* During mode transitions CIDs are temporary and need to be dropped */
if (likely(!cid_in_transit(prev->mm_cid.cid)))
return;
- mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
- prev->mm_cid.cid = MM_CID_UNSET;
+ mode = READ_ONCE(mm->mm_cid.mode);
+ cid = cid_from_transit_cid(prev->mm_cid.cid);
+
+ /*
+ * If transition mode is done, transfer ownership when the CID is
+ * within the convergence range to optimize the next schedule in.
+ */
+ if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
+ if (cid_on_cpu(mode))
+ cid = cid_to_cpu_cid(cid);
+
+ /* Update both so that the next schedule in goes into the fast path */
+ mm_cid_update_pcpu_cid(mm, cid);
+ prev->mm_cid.cid = cid;
+ } else {
+ mm_drop_cid(mm, cid);
+ prev->mm_cid.cid = MM_CID_UNSET;
+ }
}
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
@@ -3924,6 +4007,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
deactivate_task(src_rq, task, 0);
set_task_cpu(task, dst_rq->cpu);
activate_task(dst_rq, task, 0);
+ wakeup_preempt(dst_rq, task, 0);
}
static inline
@@ -3993,6 +4077,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+ const struct sched_class *class;
int flags;
bool queued;
bool running;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c903f1a42891..a612cf253c87 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
@@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192be4b5b..f95798baddeb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
DEFINE_SCHED_CLASS(stop) = {
-
- .queue_mask = 16,
-
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..ac268da91778 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (rq->ext_server.dl_server)
+ __dl_server_attach_root(&rq->ext_server, rq);
+#endif
+
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
diff --git a/kernel/signal.c b/kernel/signal.c
index e42b8bd6922f..d65d0fe24bfb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1355,8 +1355,8 @@ int zap_other_threads(struct task_struct *p)
return count;
}
-struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
- unsigned long *flags)
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+ unsigned long *flags)
{
struct sighand_struct *sighand;
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..35ea9d79a42e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,7 +31,6 @@
#include <linux/tty.h>
#include <linux/signal.h>
#include <linux/cn_proc.h>
-#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/seccomp.h>
#include <linux/cpu.h>
@@ -53,6 +52,7 @@
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
#include <linux/futex.h>
+#include <linux/rseq.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2868,6 +2868,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_FUTEX_HASH:
error = futex_hash_prctl(arg2, arg3, arg4);
break;
+ case PR_RSEQ_SLICE_EXTENSION:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = rseq_slice_extension_prctl(arg2, arg3);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
@@ -2876,8 +2881,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return error;
}
-SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
- struct getcpu_cache __user *, unused)
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused)
{
int err = 0;
int cpu = raw_smp_processor_id();
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bf5d05c635ff..add3032da16f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -390,6 +390,7 @@ COND_SYSCALL(setuid16);
/* restartable sequence */
COND_SYSCALL(rseq);
+COND_SYSCALL(rseq_slice_yield);
COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a1890a073196..df7194961658 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -252,7 +252,7 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
- int64_t md = 2 * watchdog->uncertainty_margin;
+ int64_t md = watchdog->uncertainty_margin;
unsigned int nretries, max_retries;
int64_t wd_delay, wd_seq_delay;
u64 wd_end, wd_end2;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0e4bc1ca15ff..860af7a58428 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,6 +50,14 @@
#include "tick-internal.h"
/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution.
+ */
+#define HIGH_RES_NSEC 1
+
+/*
* Masks for selecting the soft and hard context timers from
* cpu_base->active
*/
@@ -806,7 +814,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
struct hrtimer_clock_base *base = timer->base;
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+ WARN_ON_ONCE(hrtimer_get_expires(timer) < 0);
/*
* CLOCK_REALTIME timer might be requested with an absolute
@@ -943,7 +951,7 @@ void clock_was_set(unsigned int bases)
cpumask_var_t mask;
int cpu;
- if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active())
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1053,7 +1061,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
orun = ktime_divns(delta, incr);
hrtimer_add_expires_ns(timer, incr * orun);
- if (hrtimer_get_expires_tv64(timer) > now)
+ if (hrtimer_get_expires(timer) > now)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -1742,7 +1750,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
base->running = timer;
/*
@@ -1835,7 +1843,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
- if (basenow < hrtimer_get_softexpires_tv64(timer))
+ if (basenow < hrtimer_get_softexpires(timer))
break;
__run_hrtimer(cpu_base, base, timer, &basenow, flags);
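For context on the hrtimer_get_expires() checks above, hrtimer_forward() advances an expired timer by whole intervals and returns the overrun count. A hedged userspace sketch of that arithmetic in plain 64-bit nanoseconds (variable names are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t forward(uint64_t *expires, uint64_t now, uint64_t interval)
{
	uint64_t orun = 0;

	if (now > *expires) {
		uint64_t delta = now - *expires;

		orun = delta / interval;		/* ktime_divns() analogue */
		*expires += orun * interval;
		if (*expires <= now) {			/* still not in the future */
			*expires += interval;
			orun++;
		}
	}
	return orun;
}

int main(void)
{
	uint64_t expires = 1000, now = 4600, interval = 1000;
	uint64_t orun = forward(&expires, now, interval);

	printf("overruns=%llu new expiry=%llu\n",
	       (unsigned long long)orun, (unsigned long long)expires);
	return 0;
}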
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 80a8a09a21a0..413e2389f0a5 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -66,14 +66,7 @@ static const struct k_clock clock_realtime, clock_monotonic;
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id);
-
-#define lock_timer(tid) \
-({ struct k_itimer *__timr; \
- __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \
- __timr; \
-})
-
+static struct k_itimer *lock_timer(timer_t timer_id);
static inline void unlock_timer(struct k_itimer *timr)
{
if (likely((timr)))
@@ -85,7 +78,7 @@ static inline void unlock_timer(struct k_itimer *timr)
#define scoped_timer (scope)
-DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id);
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id);
DEFINE_CLASS_IS_COND_GUARD(lock_timer);
static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
@@ -600,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id)
+static struct k_itimer *lock_timer(timer_t timer_id)
{
struct k_itimer *timr;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index f39111830ca3..f3aaef695b8c 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -215,7 +215,7 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
update_clock_read_data(&rd);
- if (sched_clock_timer.function != NULL) {
+ if (ACCESS_PRIVATE(&sched_clock_timer, function) != NULL) {
/* update timeout for clock wrap */
hrtimer_start(&sched_clock_timer, cd.wrap_kt,
HRTIMER_MODE_REL_HARD);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e4f7bbe2a64..597d816d22e8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -156,7 +156,6 @@ static inline void tick_nohz_init(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
-extern unsigned long tick_nohz_active;
extern void timers_update_nohz(void);
extern u64 get_jiffies_update(unsigned long *basej);
# ifdef CONFIG_SMP
@@ -171,7 +170,6 @@ extern void timer_expire_remote(unsigned int cpu);
# endif
#else /* CONFIG_NO_HZ_COMMON */
static inline void timers_update_nohz(void) { }
-#define tick_nohz_active (0)
#endif
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2f8a7923fa27..f7907fadd63f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -344,6 +344,9 @@ static bool check_tick_dependency(atomic_t *dep)
{
int val = atomic_read(dep);
+ if (likely(!tracepoint_enabled(tick_stop)))
+ return !val;
+
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
return true;
@@ -693,7 +696,7 @@ void __init tick_nohz_init(void)
* NO HZ enabled ?
*/
bool tick_nohz_enabled __read_mostly = true;
-unsigned long tick_nohz_active __read_mostly;
+static unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -704,6 +707,12 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+bool tick_nohz_is_active(void)
+{
+ return tick_nohz_active;
+}
+EXPORT_SYMBOL_GPL(tick_nohz_is_active);
+
bool tick_nohz_tick_stopped(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 2889763165e5..1b99180da288 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -4,7 +4,9 @@
#include <linux/time.h>
/*
- * Traditional implementation of leap year evaluation.
+ * Traditional implementation of leap year evaluation, but note that long
+ * is a signed type and the tests do cover negative year values. So this
+ * can't use the is_leap_year() helper from rtc.h.
*/
static bool is_leap(long year)
{
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 3d2a354cfe1c..2e64dbb6302d 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -62,38 +62,3 @@ u64 timecounter_read(struct timecounter *tc)
}
EXPORT_SYMBOL_GPL(timecounter_read);
-/*
- * This is like cyclecounter_cyc2ns(), but it is used for computing a
- * time previous to the time stored in the cycle counter.
- */
-static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
- u64 cycles, u64 mask, u64 frac)
-{
- u64 ns = (u64) cycles;
-
- ns = ((ns * cc->mult) - frac) >> cc->shift;
-
- return ns;
-}
-
-u64 timecounter_cyc2time(const struct timecounter *tc,
- u64 cycle_tstamp)
-{
- u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
- u64 nsec = tc->nsec, frac = tc->frac;
-
- /*
- * Instead of always treating cycle_tstamp as more recent
- * than tc->cycle_last, detect when it is too far in the
- * future and treat it as old time stamp instead.
- */
- if (delta > tc->cc->mask / 2) {
- delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
- nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
- } else {
- nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
- }
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3ec3daa4acab..91fa2003351c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2735,7 +2735,7 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
result->clock_set = true;
} else {
- tk_update_leap_state_all(&tk_core);
+ tk_update_leap_state_all(tkd);
}
/* Update the multiplier immediately if frequency was set directly */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1f2364126894..7e1e3bde6b8b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -281,7 +281,7 @@ DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
static void timers_update_migration(void)
{
- if (sysctl_timer_migration && tick_nohz_active)
+ if (sysctl_timer_migration && tick_nohz_is_active())
static_branch_enable(&timers_migration_enabled);
else
static_branch_disable(&timers_migration_enabled);
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 18dda1aa782d..6da9cd562b20 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -466,9 +466,8 @@ static inline bool tmigr_is_isolated(int cpu)
{
if (!static_branch_unlikely(&tmigr_exclude_isolated))
return false;
- return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
- cpuset_cpu_is_isolated(cpu)) &&
- housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE));
}
/*
@@ -1497,7 +1496,7 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
return 0;
}
-static int tmigr_set_cpu_available(unsigned int cpu)
+static int __tmigr_set_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1505,9 +1504,6 @@ static int tmigr_set_cpu_available(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
- if (tmigr_is_isolated(cpu))
- return 0;
-
guard(mutex)(&tmigr_available_mutex);
cpumask_set_cpu(cpu, tmigr_available_cpumask);
@@ -1523,6 +1519,14 @@ static int tmigr_set_cpu_available(unsigned int cpu)
return 0;
}
+static int tmigr_set_cpu_available(unsigned int cpu)
+{
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ return __tmigr_set_cpu_available(cpu);
+}
+
static void tmigr_cpu_isolate(struct work_struct *ignored)
{
tmigr_clear_cpu_available(smp_processor_id());
@@ -1530,7 +1534,12 @@ static void tmigr_cpu_isolate(struct work_struct *ignored)
static void tmigr_cpu_unisolate(struct work_struct *ignored)
{
- tmigr_set_cpu_available(smp_processor_id());
+ /*
+	 * Don't call tmigr_is_isolated() -> housekeeping_cpu() directly, because
+	 * the cpuset mutex is correctly held by the workqueue caller but lockdep
+	 * doesn't know that.
+ */
+ __tmigr_set_cpu_available(smp_processor_id());
}
/**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index bfa2ec46e075..d7042a09fe46 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_SINGLE_FTRACE_DIRECT_OPS
+ bool
+
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d031c8d80be4..c4db5c2e7103 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -793,7 +793,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return PTR_ERR(bt);
}
blk_trace_setup_finalize(q, name, 1, bt, &buts2);
- strcpy(buts.name, buts2.name);
+ strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE);
mutex_unlock(&q->debugfs_mutex);
if (copy_to_user(arg, &buts, sizeof(buts))) {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fe28d86f7c35..f7baeb8278ca 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
- info.si_value.sival_ptr = (void *)(unsigned long)value;
+ info.si_value.sival_ptr = (void __user __force *)(unsigned long)value;
siginfo = &info;
}
@@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.func = bpf_snprintf_btf,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
@@ -1194,7 +1194,7 @@ const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
if ((u64) n >= nr_args)
return -EINVAL;
@@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
*value = ((u64 *)ctx)[nr_args];
return 0;
@@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
{
/* This helper call is inlined by verifier. */
- return ((u64 *)ctx)[-1];
+ return ((u64 *)ctx)[-1] & 0xFF;
}
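All three helpers above now mask the trampoline slot with 0xFF. A minimal sketch of the shared idea, under the assumption (not stated in these hunks) that the upper bits of ctx[-1] now carry other trampoline metadata, so only the low byte is the argument count:

/* hypothetical helper, not part of the patch */
static inline u64 bpf_trampoline_nr_args(const void *ctx)
{
	/* ctx[-1] is written by the BPF trampoline; keep only the count byte */
	return ((const u64 *)ctx)[-1] & 0xFF;
}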
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
@@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog)
static inline bool is_kprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
@@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog)
static inline bool is_uprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_trace_fsession(const struct bpf_prog *prog)
+{
+ return prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_FSESSION;
}
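/* Note (assumption): the added prog->type checks here and in is_kprobe_session() /
 * is_uprobe_session() above are presumably needed because the session kfunc set is
 * now registered for both KPROBE and TRACING program types (see
 * bpf_trace_kfuncs_init() further below), so the attach type alone no longer
 * identifies the program flavour. */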
static const struct bpf_func_proto *
@@ -1526,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1661,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1734,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
case BPF_FUNC_get_func_arg:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_proto;
+ return NULL;
case BPF_FUNC_get_func_ret:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_cnt_proto;
+ return NULL;
case BPF_FUNC_get_attach_cookie:
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP)
@@ -2063,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
struct bpf_trace_run_ctx run_ctx;
cant_sleep();
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
@@ -2077,7 +2091,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
bpf_reset_run_ctx(old_run_ctx);
out:
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
}
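/* Note (assumption): bpf_prog_get_recursion_context()/bpf_prog_put_recursion_context()
 * are taken to replace the open-coded per-CPU prog->active counter with the same
 * contract: a falsy return means the program is already running on this CPU, so the
 * run is skipped and accounted as a miss. */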
#define UNPACK(...) __VA_ARGS__
@@ -2564,6 +2578,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
+ ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr());
rcu_read_unlock();
out:
@@ -3316,7 +3331,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
__bpf_kfunc_start_defs();
-__bpf_kfunc bool bpf_session_is_return(void)
+__bpf_kfunc bool bpf_session_is_return(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3324,7 +3339,7 @@ __bpf_kfunc bool bpf_session_is_return(void)
return session_ctx->is_return;
}
-__bpf_kfunc __u64 *bpf_session_cookie(void)
+__bpf_kfunc __u64 *bpf_session_cookie(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3334,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void)
__bpf_kfunc_end_defs();
-BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_START(session_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_session_is_return)
BTF_ID_FLAGS(func, bpf_session_cookie)
-BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_END(session_kfunc_set_ids)
-static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
- if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id))
return 0;
- if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog))
return -EACCES;
return 0;
}
-static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+static const struct btf_kfunc_id_set bpf_session_kfunc_set = {
.owner = THIS_MODULE,
- .set = &kprobe_multi_kfunc_set_ids,
- .filter = bpf_kprobe_multi_filter,
+ .set = &session_kfunc_set_ids,
+ .filter = bpf_session_filter,
};
-static int __init bpf_kprobe_multi_kfuncs_init(void)
+static int __init bpf_trace_kfuncs_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+ int err = 0;
+
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set);
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set);
+
+ return err;
}
-late_initcall(bpf_kprobe_multi_kfuncs_init);
+late_initcall(bpf_trace_kfuncs_init);
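/* Note: the "err = err ?: f()" chain relies on the GNU a ?: b extension (a ? a : b),
 * so each register_btf_kfunc_id_set() call runs only while err is still zero and the
 * first failure short-circuits the remaining registrations. */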
typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
@@ -3517,7 +3537,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
@@ -3531,7 +3551,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
@@ -3545,14 +3565,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
@@ -3560,7 +3580,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, tsk);
}
@@ -3568,7 +3588,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, tsk);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index aa758efc3731..8fb38722fd5c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -68,7 +68,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1210,8 +1209,8 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static struct ftrace_func_entry *
-add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+struct ftrace_func_entry *
+add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
{
struct ftrace_func_entry *entry;
@@ -1220,11 +1219,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
return NULL;
entry->ip = ip;
+ entry->direct = direct;
__add_hash_entry(hash, entry);
return entry;
}
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ return add_ftrace_hash_entry_direct(hash, ip, 0);
+}
+
static void
free_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
@@ -1283,7 +1289,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
mutex_unlock(&ftrace_lock);
}
-static void free_ftrace_hash(struct ftrace_hash *hash)
+void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
return;
@@ -1323,7 +1329,7 @@ void ftrace_free_filter(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
-static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
struct ftrace_hash *hash;
int size;
@@ -1397,7 +1403,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- if (add_hash_entry(new_hash, entry->ip) == NULL)
+ if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL)
goto free_hash;
}
}
@@ -2068,7 +2074,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
*/
if (!ops->ops_func)
return -EBUSY;
- ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
if (ret)
return ret;
} else if (is_ipmodify) {
@@ -2624,8 +2630,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr = READ_ONCE(ops->direct_call);
+ unsigned long addr;
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+ addr = ftrace_find_rec_direct(ip);
+#else
+ addr = READ_ONCE(ops->direct_call);
+#endif
if (!addr)
return;
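/* Note (assumption): with CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS a single ftrace_ops
 * presumably serves many ips, each with its own direct trampoline, so the target is
 * looked up per-ip from the (ip -> direct) records via ftrace_find_rec_direct()
 * rather than from the single per-ops ops->direct_call slot. */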
@@ -6049,15 +6060,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
if (ftrace_hash_empty(hash))
return -EINVAL;
- /* This is a "raw" address, and this should never happen. */
- if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
- return -EINVAL;
-
mutex_lock(&direct_mutex);
- if (ops->flags & FTRACE_OPS_FL_JMP)
- addr = ftrace_jmp_set(addr);
-
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
@@ -6178,13 +6182,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
lockdep_assert_held_once(&direct_mutex);
- /* This is a "raw" address, and this should never happen. */
- if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
- return -EINVAL;
-
- if (ops->flags & FTRACE_OPS_FL_JMP)
- addr = ftrace_jmp_set(addr);
-
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
@@ -6289,6 +6286,368 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+static unsigned long hash_count(struct ftrace_hash *hash)
+{
+ return hash ? hash->count : 0;
+}
+
+/**
+ * hash_add - merges the entries of two struct ftrace_hash objects and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *add;
+ int size;
+
+ size = hash_count(a) + hash_count(b);
+ if (size > 32)
+ size = 32;
+
+ add = alloc_and_copy_ftrace_hash(fls(size), a);
+ if (!add)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) {
+ free_ftrace_hash(add);
+ return NULL;
+ }
+ }
+ }
+ return add;
+}
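For the sizing above: fls() returns the index of the highest set bit, so capping the combined entry count at 32 keeps the merged hash at no more than 6 bits, i.e. 64 buckets. A quick sketch of that arithmetic (illustration only, not part of the patch):

/* mirrors the cap-then-fls() sizing used by hash_add() */
static int merged_hash_bits(int count_a, int count_b)
{
	int size = count_a + count_b;

	if (size > 32)
		size = 32;

	return fls(size);	/* 1 entry -> 1 bit, 32 entries -> 6 bits -> 64 buckets */
}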
+
+/**
+ * update_ftrace_direct_add - Updates @ops by adding direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to add custom direct callers (ip -> addr) to @ops,
+ * specified in @hash. The @ops will be either registered or updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ */
+int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_func_entry *entry;
+ int err = -EINVAL;
+ int size;
+ bool reg;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are not already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (__ftrace_lookup_ip(direct_functions, entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ /* If there's nothing in filter_hash we need to register the ops. */
+ reg = hash_count(old_filter_hash) == 0;
+ if (reg) {
+ if (ops->func || ops->trampoline)
+ goto out_unlock;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ goto out_unlock;
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_add(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_add(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+
+ if (reg) {
+ ops->func = call_direct_funcs;
+ ops->flags |= MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+ ops->local_hash.filter_hash = new_filter_hash;
+
+ err = register_ftrace_function_nolock(ops);
+ if (err) {
+ /* restore old filter on error */
+ ops->local_hash.filter_hash = old_filter_hash;
+
+			/* clean up for a possible later register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ } else {
+ new_filter_hash = old_filter_hash;
+ }
+ } else {
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* reset direct_functions and free the new one */
+ rcu_assign_pointer(direct_functions, old_direct_functions);
+ old_direct_functions = new_direct_functions;
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
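A rough caller-side sketch of the new interface; everything except the exported ftrace helpers is made up for illustration, and a real user (e.g. the BPF trampoline code this appears to target) would choose its own sizing and error handling:

static int attach_one_direct(struct ftrace_ops *ops, unsigned long ip,
			     unsigned long direct_addr)
{
	struct ftrace_hash *hash;
	int err;

	hash = alloc_ftrace_hash(4);	/* 16 buckets, arbitrary for the sketch */
	if (!hash)
		return -ENOMEM;

	if (!add_ftrace_hash_entry_direct(hash, ip, direct_addr)) {
		free_ftrace_hash(hash);
		return -ENOMEM;
	}

	/* the entries are copied into the ops/direct_functions hashes, so the
	 * caller keeps ownership of its temporary hash */
	err = update_ftrace_direct_add(ops, hash);
	free_ftrace_hash(hash);
	return err;
}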
+
+/**
+ * hash_sub - subtracts @b from @a and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry, *del;
+ struct ftrace_hash *sub;
+ int size;
+
+ sub = alloc_and_copy_ftrace_hash(a->size_bits, a);
+ if (!sub)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(sub, entry->ip);
+ if (WARN_ON_ONCE(!del)) {
+ free_ftrace_hash(sub);
+ return NULL;
+ }
+ remove_hash_entry(sub, del);
+ kfree(del);
+ }
+ }
+ return sub;
+}
+
+/**
+ * update_ftrace_direct_del - Updates @ops by removing its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to delete custom direct callers (ip -> addr) in
+ * @ops specified via @hash. The @ops will be either unregistered or
+ * updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_entry *del;
+ unsigned long size;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ if (!hash_count(old_filter_hash))
+ goto out_unlock;
+
+ /* Make sure requested entries are already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!del || del->direct != entry->direct)
+ goto out_unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_sub(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_sub(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ /* If there's nothing left, we need to unregister the ops. */
+ if (ftrace_hash_empty(new_filter_hash)) {
+ err = unregister_ftrace_function(ops);
+ if (!err) {
+			/* clean up for a possible later register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ ftrace_free_filter(ops);
+ ops->func_hash->filter_hash = NULL;
+ }
+ } else {
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* free the new_direct_functions */
+ old_direct_functions = new_direct_functions;
+ } else {
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
+
+/**
+ * update_ftrace_direct_mod - Updates @ops by modifying its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ * @do_direct_lock: If true, lock the direct_mutex
+ *
+ * This is used to modify custom direct callers (ip -> addr) in
+ * @ops specified via @hash.
+ *
+ * This can be called from within ftrace ops_func callback with
+ * direct_mutex already locked, in which case @do_direct_lock
+ * needs to be false.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
+{
+ struct ftrace_func_entry *entry, *tmp;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ struct ftrace_hash *orig_hash;
+ unsigned long size, i;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ /*
+ * We can be called from within ops_func callback with direct_mutex
+ * already taken.
+ */
+ if (do_direct_lock)
+ mutex_lock(&direct_mutex);
+
+ orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+ if (!orig_hash)
+ goto unlock;
+
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+
+ err = register_ftrace_function_nolock(&tmp_ops);
+ if (err)
+ goto unlock;
+
+ /*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true);
+ if (err)
+ goto out;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ tmp = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!tmp)
+ continue;
+ tmp->direct = entry->direct;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+out:
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+unlock:
+ if (do_direct_lock)
+ mutex_unlock(&direct_mutex);
+ return err;
+}
+
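/* Note (assumption): tmp_ops above is function-local static, so concurrent callers
 * would share it; that is presumably safe only because every path through
 * update_ftrace_direct_mod() runs under direct_mutex, either taken here or already
 * held by the ops_func caller when do_direct_lock is false. */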
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -7753,7 +8112,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
- unsigned long *off, char **modname, char *sym)
+ unsigned long *off, char **modname,
+ const unsigned char **modbuildid, char *sym)
{
struct ftrace_mod_map *mod_map;
int ret = 0;
@@ -7765,6 +8125,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
if (ret) {
if (modname)
*modname = mod_map->mod->name;
+ if (modbuildid)
+ *modbuildid = module_buildid(mod_map->mod);
break;
}
}
@@ -8709,7 +9071,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
if (!op->ops_func)
return -EBUSY;
- ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
if (ret)
return ret;
}
@@ -8756,7 +9118,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
/* The cleanup is optional, ignore any errors */
if (found_op && op->ops_func)
- op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
}
}
mutex_unlock(&direct_mutex);
diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c
index 5a83b7171432..4b5646a70094 100644
--- a/kernel/trace/rv/monitors/nrp/nrp.c
+++ b/kernel/trace/rv/monitors/nrp/nrp.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "nrp"
@@ -15,17 +14,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "nrp.h"
-
-static struct rv_monitor rv_nrp;
-DECLARE_DA_MON_PER_TASK(nrp, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_nrp(current, irq_entry_nrp);
+ da_handle_event(current, irq_entry_nrp);
}
static void attach_vector_irq(void)
@@ -60,7 +58,7 @@ static void detach_vector_irq(void) { }
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_nrp(current, irq_entry_nrp);
+ da_handle_event(current, irq_entry_nrp);
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk,
@@ -72,22 +70,22 @@ static void handle_sched_need_resched(void *data, struct task_struct *tsk,
* which may not mirror the system state but makes the monitor simpler,
*/
if (tif == TIF_NEED_RESCHED)
- da_handle_start_event_nrp(tsk, sched_need_resched_nrp);
+ da_handle_start_event(tsk, sched_need_resched_nrp);
}
static void handle_schedule_entry(void *data, bool preempt)
{
if (preempt)
- da_handle_event_nrp(current, schedule_entry_preempt_nrp);
+ da_handle_event(current, schedule_entry_preempt_nrp);
else
- da_handle_event_nrp(current, schedule_entry_nrp);
+ da_handle_event(current, schedule_entry_nrp);
}
static int enable_nrp(void)
{
int retval;
- retval = da_monitor_init_nrp();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -101,33 +99,33 @@ static int enable_nrp(void)
static void disable_nrp(void)
{
- rv_nrp.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
detach_vector_irq();
- da_monitor_destroy_nrp();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_nrp = {
+static struct rv_monitor rv_this = {
.name = "nrp",
.description = "need resched preempts.",
.enable = enable_nrp,
.disable = disable_nrp,
- .reset = da_monitor_reset_all_nrp,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_nrp(void)
{
- return rv_register_monitor(&rv_nrp, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_nrp(void)
{
- rv_unregister_monitor(&rv_nrp);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_nrp);
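The same conversion repeats for every monitor in the rest of this patch (opid, sco, scpd, snep, snroc, sssw, sts): the per-monitor DECLARE_DA_MON_* declaration and the _<name>-suffixed helpers go away, and each monitor instead sets two macros before pulling in the generic header. Roughly, as suggested by these hunks (the da_monitor.h side of the contract is not shown here):

/* in the monitor's header */
#define MONITOR_NAME nrp

/* in the monitor's .c file */
#define RV_MON_TYPE RV_MON_PER_TASK	/* or RV_MON_PER_CPU */
#include "nrp.h"
#include <rv/da_monitor.h>	/* provides the unsuffixed da_handle_event(),
				 * da_handle_start_event(), da_monitor_init(),
				 * da_monitor_destroy() and da_monitor_reset_all(),
				 * presumably keyed off MONITOR_NAME, RV_MON_TYPE and
				 * the uniformly named rv_this monitor struct */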
diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
index c9f12207cbf6..3270d4c0139f 100644
--- a/kernel/trace/rv/monitors/nrp/nrp.h
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -5,22 +5,24 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME nrp
+
enum states_nrp {
- preempt_irq_nrp = 0,
+ preempt_irq_nrp,
any_thread_running_nrp,
nested_preempt_nrp,
rescheduling_nrp,
- state_max_nrp
+ state_max_nrp,
};
#define INVALID_STATE state_max_nrp
enum events_nrp {
- irq_entry_nrp = 0,
+ irq_entry_nrp,
sched_need_resched_nrp,
schedule_entry_nrp,
schedule_entry_preempt_nrp,
- event_max_nrp
+ event_max_nrp,
};
struct automaton_nrp {
@@ -36,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = {
"preempt_irq",
"any_thread_running",
"nested_preempt",
- "rescheduling"
+ "rescheduling",
},
.event_names = {
"irq_entry",
"sched_need_resched",
"schedule_entry",
- "schedule_entry_preempt"
+ "schedule_entry_preempt",
},
.function = {
{
preempt_irq_nrp,
preempt_irq_nrp,
nested_preempt_nrp,
- nested_preempt_nrp
+ nested_preempt_nrp,
},
{
any_thread_running_nrp,
rescheduling_nrp,
any_thread_running_nrp,
- INVALID_STATE
+ INVALID_STATE,
},
{
nested_preempt_nrp,
preempt_irq_nrp,
any_thread_running_nrp,
- any_thread_running_nrp
+ any_thread_running_nrp,
},
{
preempt_irq_nrp,
rescheduling_nrp,
any_thread_running_nrp,
- any_thread_running_nrp
+ any_thread_running_nrp,
},
},
.initial_state = preempt_irq_nrp,
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 50d64e7fb8c4..25a40e90fa40 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "opid"
@@ -16,17 +15,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "opid.h"
-
-static struct rv_monitor rv_opid;
-DECLARE_DA_MON_PER_CPU(opid, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_opid(irq_entry_opid);
+ da_handle_event(irq_entry_opid);
}
static void attach_vector_irq(void)
@@ -61,52 +59,52 @@ static void detach_vector_irq(void) { }
static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(irq_disable_opid);
+ da_handle_event(irq_disable_opid);
}
static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(irq_enable_opid);
+ da_handle_event(irq_enable_opid);
}
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_opid(irq_entry_opid);
+ da_handle_event(irq_entry_opid);
}
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(preempt_disable_opid);
+ da_handle_event(preempt_disable_opid);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(preempt_enable_opid);
+ da_handle_event(preempt_enable_opid);
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
{
	/* The monitor's initial state is not in_irq */
if (this_cpu_read(hardirq_context))
- da_handle_event_opid(sched_need_resched_opid);
+ da_handle_event(sched_need_resched_opid);
else
- da_handle_start_event_opid(sched_need_resched_opid);
+ da_handle_start_event(sched_need_resched_opid);
}
static void handle_sched_waking(void *data, struct task_struct *p)
{
	/* The monitor's initial state is not in_irq */
if (this_cpu_read(hardirq_context))
- da_handle_event_opid(sched_waking_opid);
+ da_handle_event(sched_waking_opid);
else
- da_handle_start_event_opid(sched_waking_opid);
+ da_handle_start_event(sched_waking_opid);
}
static int enable_opid(void)
{
int retval;
- retval = da_monitor_init_opid();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -124,7 +122,7 @@ static int enable_opid(void)
static void disable_opid(void)
{
- rv_opid.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
@@ -135,29 +133,29 @@ static void disable_opid(void)
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
detach_vector_irq();
- da_monitor_destroy_opid();
+ da_monitor_destroy();
}
/*
* This is the monitor register section.
*/
-static struct rv_monitor rv_opid = {
+static struct rv_monitor rv_this = {
.name = "opid",
.description = "operations with preemption and irq disabled.",
.enable = enable_opid,
.disable = disable_opid,
- .reset = da_monitor_reset_all_opid,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_opid(void)
{
- return rv_register_monitor(&rv_opid, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_opid(void)
{
- rv_unregister_monitor(&rv_opid);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_opid);
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
index b4b8c2ff7f64..092992514970 100644
--- a/kernel/trace/rv/monitors/opid/opid.h
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -5,26 +5,28 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME opid
+
enum states_opid {
- disabled_opid = 0,
+ disabled_opid,
enabled_opid,
in_irq_opid,
irq_disabled_opid,
preempt_disabled_opid,
- state_max_opid
+ state_max_opid,
};
#define INVALID_STATE state_max_opid
enum events_opid {
- irq_disable_opid = 0,
+ irq_disable_opid,
irq_enable_opid,
irq_entry_opid,
preempt_disable_opid,
preempt_enable_opid,
sched_need_resched_opid,
sched_waking_opid,
- event_max_opid
+ event_max_opid,
};
struct automaton_opid {
@@ -41,7 +43,7 @@ static const struct automaton_opid automaton_opid = {
"enabled",
"in_irq",
"irq_disabled",
- "preempt_disabled"
+ "preempt_disabled",
},
.event_names = {
"irq_disable",
@@ -50,7 +52,7 @@ static const struct automaton_opid automaton_opid = {
"preempt_disable",
"preempt_enable",
"sched_need_resched",
- "sched_waking"
+ "sched_waking",
},
.function = {
{
@@ -60,7 +62,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
irq_disabled_opid,
disabled_opid,
- disabled_opid
+ disabled_opid,
},
{
irq_disabled_opid,
@@ -69,7 +71,7 @@ static const struct automaton_opid automaton_opid = {
preempt_disabled_opid,
enabled_opid,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -78,7 +80,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
INVALID_STATE,
in_irq_opid,
- in_irq_opid
+ in_irq_opid,
},
{
INVALID_STATE,
@@ -87,7 +89,7 @@ static const struct automaton_opid automaton_opid = {
disabled_opid,
INVALID_STATE,
irq_disabled_opid,
- INVALID_STATE
+ INVALID_STATE,
},
{
disabled_opid,
@@ -96,7 +98,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
enabled_opid,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = disabled_opid,
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
index fd75fc927d65..17f271231c99 100644
--- a/kernel/trace/rv/monitors/rtapp/rtapp.c
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -8,8 +8,6 @@
#include "rtapp.h"
-struct rv_monitor rv_rtapp;
-
struct rv_monitor rv_rtapp = {
.name = "rtapp",
.description = "Collection of monitors for detecting problems with real-time applications",
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
index d04db4b543f9..dd9d96fc6e21 100644
--- a/kernel/trace/rv/monitors/sched/sched.c
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -8,8 +8,6 @@
#include "sched.h"
-struct rv_monitor rv_sched;
-
struct rv_monitor rv_sched = {
.name = "sched",
.description = "container for several scheduler monitor specifications.",
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 04c36405e2e3..5a3bd5e16e62 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sco"
@@ -14,31 +13,30 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "sco.h"
-
-static struct rv_monitor rv_sco;
-DECLARE_DA_MON_PER_CPU(sco, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
- da_handle_start_event_sco(sched_set_state_sco);
+ da_handle_start_event(sched_set_state_sco);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_sco(schedule_entry_sco);
+ da_handle_event(schedule_entry_sco);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_sco(schedule_exit_sco);
+ da_handle_start_event(schedule_exit_sco);
}
static int enable_sco(void)
{
int retval;
- retval = da_monitor_init_sco();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,32 +49,32 @@ static int enable_sco(void)
static void disable_sco(void)
{
- rv_sco.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_sco();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_sco = {
+static struct rv_monitor rv_this = {
.name = "sco",
.description = "scheduling context operations.",
.enable = enable_sco,
.disable = disable_sco,
- .reset = da_monitor_reset_all_sco,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sco(void)
{
- return rv_register_monitor(&rv_sco, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sco(void)
{
- rv_unregister_monitor(&rv_sco);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sco);
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
index 7a4c1f2d5ca1..bac3beb51e72 100644
--- a/kernel/trace/rv/monitors/sco/sco.h
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sco
+
enum states_sco {
- thread_context_sco = 0,
+ thread_context_sco,
scheduling_context_sco,
- state_max_sco
+ state_max_sco,
};
#define INVALID_STATE state_max_sco
enum events_sco {
- sched_set_state_sco = 0,
+ sched_set_state_sco,
schedule_entry_sco,
schedule_exit_sco,
- event_max_sco
+ event_max_sco,
};
struct automaton_sco {
@@ -31,12 +33,12 @@ struct automaton_sco {
static const struct automaton_sco automaton_sco = {
.state_names = {
"thread_context",
- "scheduling_context"
+ "scheduling_context",
},
.event_names = {
"sched_set_state",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{ thread_context_sco, scheduling_context_sco, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
index 1e351ba52fee..83b48627dc9f 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.c
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "scpd"
@@ -15,36 +14,35 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "scpd.h"
-
-static struct rv_monitor rv_scpd;
-DECLARE_DA_MON_PER_CPU(scpd, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_scpd(preempt_disable_scpd);
+ da_handle_event(preempt_disable_scpd);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_scpd(preempt_enable_scpd);
+ da_handle_start_event(preempt_enable_scpd);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_scpd(schedule_entry_scpd);
+ da_handle_event(schedule_entry_scpd);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_event_scpd(schedule_exit_scpd);
+ da_handle_event(schedule_exit_scpd);
}
static int enable_scpd(void)
{
int retval;
- retval = da_monitor_init_scpd();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -58,33 +56,33 @@ static int enable_scpd(void)
static void disable_scpd(void)
{
- rv_scpd.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_scpd();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_scpd = {
+static struct rv_monitor rv_this = {
.name = "scpd",
.description = "schedule called with preemption disabled.",
.enable = enable_scpd,
.disable = disable_scpd,
- .reset = da_monitor_reset_all_scpd,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_scpd(void)
{
- return rv_register_monitor(&rv_scpd, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_scpd(void)
{
- rv_unregister_monitor(&rv_scpd);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_scpd);
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
index 295f735a5811..d6329da2671b 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.h
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -5,20 +5,22 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME scpd
+
enum states_scpd {
- cant_sched_scpd = 0,
+ cant_sched_scpd,
can_sched_scpd,
- state_max_scpd
+ state_max_scpd,
};
#define INVALID_STATE state_max_scpd
enum events_scpd {
- preempt_disable_scpd = 0,
+ preempt_disable_scpd,
preempt_enable_scpd,
schedule_entry_scpd,
schedule_exit_scpd,
- event_max_scpd
+ event_max_scpd,
};
struct automaton_scpd {
@@ -32,13 +34,13 @@ struct automaton_scpd {
static const struct automaton_scpd automaton_scpd = {
.state_names = {
"cant_sched",
- "can_sched"
+ "can_sched",
},
.event_names = {
"preempt_disable",
"preempt_enable",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{ can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
index 558950f524a5..b80b73795dec 100644
--- a/kernel/trace/rv/monitors/snep/snep.c
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "snep"
@@ -15,36 +14,35 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "snep.h"
-
-static struct rv_monitor rv_snep;
-DECLARE_DA_MON_PER_CPU(snep, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_snep(preempt_disable_snep);
+ da_handle_start_event(preempt_disable_snep);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_snep(preempt_enable_snep);
+ da_handle_start_event(preempt_enable_snep);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_snep(schedule_entry_snep);
+ da_handle_event(schedule_entry_snep);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_snep(schedule_exit_snep);
+ da_handle_start_event(schedule_exit_snep);
}
static int enable_snep(void)
{
int retval;
- retval = da_monitor_init_snep();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -58,33 +56,33 @@ static int enable_snep(void)
static void disable_snep(void)
{
- rv_snep.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_snep();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_snep = {
+static struct rv_monitor rv_this = {
.name = "snep",
.description = "schedule does not enable preempt.",
.enable = enable_snep,
.disable = disable_snep,
- .reset = da_monitor_reset_all_snep,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_snep(void)
{
- return rv_register_monitor(&rv_snep, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_snep(void)
{
- rv_unregister_monitor(&rv_snep);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_snep);
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
index 4cd9abb77b7b..357520a5b3d1 100644
--- a/kernel/trace/rv/monitors/snep/snep.h
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -5,20 +5,22 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME snep
+
enum states_snep {
- non_scheduling_context_snep = 0,
+ non_scheduling_context_snep,
scheduling_contex_snep,
- state_max_snep
+ state_max_snep,
};
#define INVALID_STATE state_max_snep
enum events_snep {
- preempt_disable_snep = 0,
+ preempt_disable_snep,
preempt_enable_snep,
schedule_entry_snep,
schedule_exit_snep,
- event_max_snep
+ event_max_snep,
};
struct automaton_snep {
@@ -32,26 +34,26 @@ struct automaton_snep {
static const struct automaton_snep automaton_snep = {
.state_names = {
"non_scheduling_context",
- "scheduling_contex"
+ "scheduling_contex",
},
.event_names = {
"preempt_disable",
"preempt_enable",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{
non_scheduling_context_snep,
non_scheduling_context_snep,
scheduling_contex_snep,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- non_scheduling_context_snep
+ non_scheduling_context_snep,
},
},
.initial_state = non_scheduling_context_snep,
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
index 540e686e699f..f168b1a4b12c 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.c
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "snroc"
@@ -14,14 +13,13 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "snroc.h"
-
-static struct rv_monitor rv_snroc;
-DECLARE_DA_MON_PER_TASK(snroc, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
- da_handle_event_snroc(tsk, sched_set_state_snroc);
+ da_handle_event(tsk, sched_set_state_snroc);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -29,15 +27,15 @@ static void handle_sched_switch(void *data, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
- da_handle_start_event_snroc(prev, sched_switch_out_snroc);
- da_handle_event_snroc(next, sched_switch_in_snroc);
+ da_handle_start_event(prev, sched_switch_out_snroc);
+ da_handle_event(next, sched_switch_in_snroc);
}
static int enable_snroc(void)
{
int retval;
- retval = da_monitor_init_snroc();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -49,31 +47,31 @@ static int enable_snroc(void)
static void disable_snroc(void)
{
- rv_snroc.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch);
- da_monitor_destroy_snroc();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_snroc = {
+static struct rv_monitor rv_this = {
.name = "snroc",
.description = "set non runnable on its own context.",
.enable = enable_snroc,
.disable = disable_snroc,
- .reset = da_monitor_reset_all_snroc,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_snroc(void)
{
- return rv_register_monitor(&rv_snroc, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_snroc(void)
{
- rv_unregister_monitor(&rv_snroc);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_snroc);
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
index c3650a2b1b10..88b7328ad31a 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.h
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME snroc
+
enum states_snroc {
- other_context_snroc = 0,
+ other_context_snroc,
own_context_snroc,
- state_max_snroc
+ state_max_snroc,
};
#define INVALID_STATE state_max_snroc
enum events_snroc {
- sched_set_state_snroc = 0,
+ sched_set_state_snroc,
sched_switch_in_snroc,
sched_switch_out_snroc,
- event_max_snroc
+ event_max_snroc,
};
struct automaton_snroc {
@@ -31,12 +33,12 @@ struct automaton_snroc {
static const struct automaton_snroc automaton_snroc = {
.state_names = {
"other_context",
- "own_context"
+ "own_context",
},
.event_names = {
"sched_set_state",
"sched_switch_in",
- "sched_switch_out"
+ "sched_switch_out",
},
.function = {
{ INVALID_STATE, own_context_snroc, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c
index 84b8d890d9d4..a91321c890cd 100644
--- a/kernel/trace/rv/monitors/sssw/sssw.c
+++ b/kernel/trace/rv/monitors/sssw/sssw.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sssw"
@@ -15,17 +14,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "sssw.h"
-
-static struct rv_monitor rv_sssw;
-DECLARE_DA_MON_PER_TASK(sssw, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
if (state == TASK_RUNNING)
- da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw);
+ da_handle_start_event(tsk, sched_set_state_runnable_sssw);
else
- da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw);
+ da_handle_event(tsk, sched_set_state_sleepable_sssw);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -34,15 +32,15 @@ static void handle_sched_switch(void *data, bool preempt,
unsigned int prev_state)
{
if (preempt)
- da_handle_event_sssw(prev, sched_switch_preempt_sssw);
+ da_handle_event(prev, sched_switch_preempt_sssw);
else if (prev_state == TASK_RUNNING)
- da_handle_event_sssw(prev, sched_switch_yield_sssw);
+ da_handle_event(prev, sched_switch_yield_sssw);
else if (prev_state == TASK_RTLOCK_WAIT)
/* special case of sleeping task with racy conditions */
- da_handle_event_sssw(prev, sched_switch_blocking_sssw);
+ da_handle_event(prev, sched_switch_blocking_sssw);
else
- da_handle_event_sssw(prev, sched_switch_suspend_sssw);
- da_handle_event_sssw(next, sched_switch_in_sssw);
+ da_handle_event(prev, sched_switch_suspend_sssw);
+ da_handle_event(next, sched_switch_in_sssw);
}
static void handle_sched_wakeup(void *data, struct task_struct *p)
@@ -51,21 +49,21 @@ static void handle_sched_wakeup(void *data, struct task_struct *p)
* Wakeup can also lead to signal_wakeup although the system is
* actually runnable. The monitor can safely start with this event.
*/
- da_handle_start_event_sssw(p, sched_wakeup_sssw);
+ da_handle_start_event(p, sched_wakeup_sssw);
}
static void handle_signal_deliver(void *data, int sig,
struct kernel_siginfo *info,
struct k_sigaction *ka)
{
- da_handle_event_sssw(current, signal_deliver_sssw);
+ da_handle_event(current, signal_deliver_sssw);
}
static int enable_sssw(void)
{
int retval;
- retval = da_monitor_init_sssw();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -79,33 +77,33 @@ static int enable_sssw(void)
static void disable_sssw(void)
{
- rv_sssw.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch);
rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
- da_monitor_destroy_sssw();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_sssw = {
+static struct rv_monitor rv_this = {
.name = "sssw",
.description = "set state sleep and wakeup.",
.enable = enable_sssw,
.disable = disable_sssw,
- .reset = da_monitor_reset_all_sssw,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sssw(void)
{
- return rv_register_monitor(&rv_sssw, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sssw(void)
{
- rv_unregister_monitor(&rv_sssw);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sssw);
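
The conversion above shows the new generic da_monitor interface: instead of DECLARE_DA_MON_PER_TASK()/PER_CPU() and per-monitor suffixed helpers, a monitor defines MONITOR_NAME (in its header) and RV_MON_TYPE before including <rv/da_monitor.h>, then uses the unsuffixed da_handle_event(), da_monitor_init() and friends against a monitor struct named rv_this. A minimal hypothetical monitor following that pattern might look like the sketch below; "foo", handle_foo_wakeup() and wakeup_foo are illustrative names, and it is assumed, as the diff implies, that <rv/da_monitor.h> forward-declares rv_this so it can be referenced before its definition.

#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>

#define MODULE_NAME "foo"
#include <rv_trace.h>
#include <trace/events/sched.h>

#define RV_MON_TYPE RV_MON_PER_TASK		/* or RV_MON_PER_CPU */
#include "foo.h"				/* defines MONITOR_NAME and the automaton */
#include <rv/da_monitor.h>			/* assumed to forward-declare rv_this */

static void handle_foo_wakeup(void *data, struct task_struct *p)
{
	da_handle_event(p, wakeup_foo);		/* per-task helpers take the task */
}

static int enable_foo(void)
{
	int retval = da_monitor_init();

	if (retval)
		return retval;

	rv_attach_trace_probe("foo", sched_wakeup, handle_foo_wakeup);
	return 0;
}

static void disable_foo(void)
{
	rv_this.enabled = 0;
	rv_detach_trace_probe("foo", sched_wakeup, handle_foo_wakeup);
	da_monitor_destroy();
}

static struct rv_monitor rv_this = {
	.name		= "foo",
	.description	= "illustrative skeleton only.",
	.enable		= enable_foo,
	.disable	= disable_foo,
	.reset		= da_monitor_reset_all,
	.enabled	= 0,
};
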
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
index 243d54050c94..1a4b806061c3 100644
--- a/kernel/trace/rv/monitors/sssw/sssw.h
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -5,18 +5,20 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sssw
+
enum states_sssw {
- runnable_sssw = 0,
+ runnable_sssw,
signal_wakeup_sssw,
sleepable_sssw,
sleeping_sssw,
- state_max_sssw
+ state_max_sssw,
};
#define INVALID_STATE state_max_sssw
enum events_sssw {
- sched_set_state_runnable_sssw = 0,
+ sched_set_state_runnable_sssw,
sched_set_state_sleepable_sssw,
sched_switch_blocking_sssw,
sched_switch_in_sssw,
@@ -25,7 +27,7 @@ enum events_sssw {
sched_switch_yield_sssw,
sched_wakeup_sssw,
signal_deliver_sssw,
- event_max_sssw
+ event_max_sssw,
};
struct automaton_sssw {
@@ -41,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = {
"runnable",
"signal_wakeup",
"sleepable",
- "sleeping"
+ "sleeping",
},
.event_names = {
"sched_set_state_runnable",
@@ -52,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = {
"sched_switch_suspend",
"sched_switch_yield",
"sched_wakeup",
- "signal_deliver"
+ "signal_deliver",
},
.function = {
{
@@ -64,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
runnable_sssw,
runnable_sssw,
- runnable_sssw
+ runnable_sssw,
},
{
INVALID_STATE,
@@ -75,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
signal_wakeup_sssw,
signal_wakeup_sssw,
- runnable_sssw
+ runnable_sssw,
},
{
runnable_sssw,
@@ -86,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = {
sleeping_sssw,
signal_wakeup_sssw,
runnable_sssw,
- sleepable_sssw
+ sleepable_sssw,
},
{
INVALID_STATE,
@@ -97,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
INVALID_STATE,
runnable_sssw,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = runnable_sssw,
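
For reference, the tables above are consumed by the generic deterministic-automata code as a plain two-dimensional lookup: the current state and the incoming event index .function[][], and INVALID_STATE marks a transition the model forbids. A simplified sketch of that step (sssw_step() is a hypothetical helper, not the kernel's implementation):

/* Hypothetical helper showing how the transition table is consumed. */
static bool sssw_step(unsigned char *state, enum events_sssw event)
{
	unsigned char next = automaton_sssw.function[*state][event];

	if (next == INVALID_STATE)
		return false;	/* the model forbids this event in this state */

	*state = next;		/* take the transition */
	return true;
}
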
diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c
index c4a9cd67c1d2..ce031cbf202a 100644
--- a/kernel/trace/rv/monitors/sts/sts.c
+++ b/kernel/trace/rv/monitors/sts/sts.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sts"
@@ -16,17 +15,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "sts.h"
-
-static struct rv_monitor rv_sts;
-DECLARE_DA_MON_PER_CPU(sts, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_sts(irq_entry_sts);
+ da_handle_event(irq_entry_sts);
}
static void attach_vector_irq(void)
@@ -61,17 +59,17 @@ static void detach_vector_irq(void) { }
static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_sts(irq_disable_sts);
+ da_handle_event(irq_disable_sts);
}
static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_sts(irq_enable_sts);
+ da_handle_event(irq_enable_sts);
}
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_sts(irq_entry_sts);
+ da_handle_event(irq_entry_sts);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -79,24 +77,24 @@ static void handle_sched_switch(void *data, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
- da_handle_event_sts(sched_switch_sts);
+ da_handle_event(sched_switch_sts);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_sts(schedule_entry_sts);
+ da_handle_event(schedule_entry_sts);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_sts(schedule_exit_sts);
+ da_handle_start_event(schedule_exit_sts);
}
static int enable_sts(void)
{
int retval;
- retval = da_monitor_init_sts();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -113,7 +111,7 @@ static int enable_sts(void)
static void disable_sts(void)
{
- rv_sts.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sts", irq_disable, handle_irq_disable);
rv_detach_trace_probe("sts", irq_enable, handle_irq_enable);
@@ -123,29 +121,29 @@ static void disable_sts(void)
rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
detach_vector_irq();
- da_monitor_destroy_sts();
+ da_monitor_destroy();
}
/*
* This is the monitor register section.
*/
-static struct rv_monitor rv_sts = {
+static struct rv_monitor rv_this = {
.name = "sts",
.description = "schedule implies task switch.",
.enable = enable_sts,
.disable = disable_sts,
- .reset = da_monitor_reset_all_sts,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sts(void)
{
- return rv_register_monitor(&rv_sts, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sts(void)
{
- rv_unregister_monitor(&rv_sts);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sts);
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
index 3368b6599a00..6f7b2d9d72e6 100644
--- a/kernel/trace/rv/monitors/sts/sts.h
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -5,27 +5,29 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sts
+
enum states_sts {
- can_sched_sts = 0,
+ can_sched_sts,
cant_sched_sts,
disable_to_switch_sts,
enable_to_exit_sts,
in_irq_sts,
scheduling_sts,
switching_sts,
- state_max_sts
+ state_max_sts,
};
#define INVALID_STATE state_max_sts
enum events_sts {
- irq_disable_sts = 0,
+ irq_disable_sts,
irq_enable_sts,
irq_entry_sts,
sched_switch_sts,
schedule_entry_sts,
schedule_exit_sts,
- event_max_sts
+ event_max_sts,
};
struct automaton_sts {
@@ -44,7 +46,7 @@ static const struct automaton_sts automaton_sts = {
"enable_to_exit",
"in_irq",
"scheduling",
- "switching"
+ "switching",
},
.event_names = {
"irq_disable",
@@ -52,7 +54,7 @@ static const struct automaton_sts automaton_sts = {
"irq_entry",
"sched_switch",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{
@@ -61,7 +63,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
scheduling_sts,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -69,7 +71,7 @@ static const struct automaton_sts automaton_sts = {
cant_sched_sts,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -77,7 +79,7 @@ static const struct automaton_sts automaton_sts = {
in_irq_sts,
switching_sts,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
enable_to_exit_sts,
@@ -85,7 +87,7 @@ static const struct automaton_sts automaton_sts = {
enable_to_exit_sts,
INVALID_STATE,
INVALID_STATE,
- can_sched_sts
+ can_sched_sts,
},
{
INVALID_STATE,
@@ -93,7 +95,7 @@ static const struct automaton_sts automaton_sts = {
in_irq_sts,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
disable_to_switch_sts,
@@ -101,7 +103,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -109,7 +111,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = can_sched_sts,
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 4b4e99615a11..22d77ec42463 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wip"
@@ -14,31 +13,30 @@
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "wip.h"
-
-static struct rv_monitor rv_wip;
-DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_wip(preempt_disable_wip);
+ da_handle_event(preempt_disable_wip);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_wip(preempt_enable_wip);
+ da_handle_start_event(preempt_enable_wip);
}
static void handle_sched_waking(void *data, struct task_struct *task)
{
- da_handle_event_wip(sched_waking_wip);
+ da_handle_event(sched_waking_wip);
}
static int enable_wip(void)
{
int retval;
- retval = da_monitor_init_wip();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,32 +49,32 @@ static int enable_wip(void)
static void disable_wip(void)
{
- rv_wip.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
- da_monitor_destroy_wip();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wip = {
+static struct rv_monitor rv_this = {
.name = "wip",
.description = "wakeup in preemptive per-cpu testing monitor.",
.enable = enable_wip,
.disable = disable_wip,
- .reset = da_monitor_reset_all_wip,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wip(void)
{
- return rv_register_monitor(&rv_wip, NULL);
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wip(void)
{
- rv_unregister_monitor(&rv_wip);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wip);
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index c7193748bf36..b4c3eea94c86 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wip
+
enum states_wip {
- preemptive_wip = 0,
+ preemptive_wip,
non_preemptive_wip,
- state_max_wip
+ state_max_wip,
};
#define INVALID_STATE state_max_wip
enum events_wip {
- preempt_disable_wip = 0,
+ preempt_disable_wip,
preempt_enable_wip,
sched_waking_wip,
- event_max_wip
+ event_max_wip,
};
struct automaton_wip {
@@ -31,12 +33,12 @@ struct automaton_wip {
static const struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
- "non_preemptive"
+ "non_preemptive",
},
.event_names = {
"preempt_disable",
"preempt_enable",
- "sched_waking"
+ "sched_waking",
},
.function = {
{ non_preemptive_wip, INVALID_STATE, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 4145bea2729e..579e7e217ee0 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -6,40 +6,38 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wwnr"
#include <rv_trace.h>
#include <trace/events/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "wwnr.h"
-
-static struct rv_monitor rv_wwnr;
-DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+#include <rv/da_monitor.h>
static void handle_switch(void *data, bool preempt, struct task_struct *p,
struct task_struct *n, unsigned int prev_state)
{
/* start monitoring only after the first suspension */
if (prev_state == TASK_INTERRUPTIBLE)
- da_handle_start_event_wwnr(p, switch_out_wwnr);
+ da_handle_start_event(p, switch_out_wwnr);
else
- da_handle_event_wwnr(p, switch_out_wwnr);
+ da_handle_event(p, switch_out_wwnr);
- da_handle_event_wwnr(n, switch_in_wwnr);
+ da_handle_event(n, switch_in_wwnr);
}
static void handle_wakeup(void *data, struct task_struct *p)
{
- da_handle_event_wwnr(p, wakeup_wwnr);
+ da_handle_event(p, wakeup_wwnr);
}
static int enable_wwnr(void)
{
int retval;
- retval = da_monitor_init_wwnr();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,31 +49,31 @@ static int enable_wwnr(void)
static void disable_wwnr(void)
{
- rv_wwnr.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
- da_monitor_destroy_wwnr();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_this = {
.name = "wwnr",
.description = "wakeup while not running per-task testing model.",
.enable = enable_wwnr,
.disable = disable_wwnr,
- .reset = da_monitor_reset_all_wwnr,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wwnr(void)
{
- return rv_register_monitor(&rv_wwnr, NULL);
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wwnr(void)
{
- rv_unregister_monitor(&rv_wwnr);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wwnr);
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 0a59d23edf61..a28006512c9b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wwnr
+
enum states_wwnr {
- not_running_wwnr = 0,
+ not_running_wwnr,
running_wwnr,
- state_max_wwnr
+ state_max_wwnr,
};
#define INVALID_STATE state_max_wwnr
enum events_wwnr {
- switch_in_wwnr = 0,
+ switch_in_wwnr,
switch_out_wwnr,
wakeup_wwnr,
- event_max_wwnr
+ event_max_wwnr,
};
struct automaton_wwnr {
@@ -31,12 +33,12 @@ struct automaton_wwnr {
static const struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
- "running"
+ "running",
},
.event_names = {
"switch_in",
"switch_out",
- "wakeup"
+ "wakeup",
},
.function = {
{ running_wwnr, INVALID_STATE, not_running_wwnr },
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index baec63134ab6..b1cb30a7b83d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1178,11 +1178,10 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
* __trace_puts - write a constant string into the trace buffer.
* @ip: The address of the caller
* @str: The constant string to write
- * @size: The size of the string.
*/
-int __trace_puts(unsigned long ip, const char *str, int size)
+int __trace_puts(unsigned long ip, const char *str)
{
- return __trace_array_puts(printk_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, strlen(str));
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1201,7 +1200,7 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
if (!printk_binsafe(tr))
- return __trace_puts(ip, str, strlen(str));
+ return __trace_puts(ip, str);
if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
return 0;
@@ -6115,10 +6114,10 @@ static int cmp_mod_entry(const void *key, const void *pivot)
unsigned long addr = (unsigned long)key;
const struct trace_mod_entry *ent = pivot;
- if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr)
- return 0;
- else
- return addr - ent->mod_addr;
+ if (addr < ent[0].mod_addr)
+ return -1;
+
+ return addr >= ent[1].mod_addr;
}
/**
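
The rewritten cmp_mod_entry() above follows the usual contract for a range lookup with bsearch(): negative when the key address lies below ent[0].mod_addr, zero when it falls inside [ent[0].mod_addr, ent[1].mod_addr), and positive otherwise, which also means the table needs a trailing sentinel entry so that ent[1] is always valid. A self-contained userspace sketch of the same idiom (struct range, the values and cmp_range() are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long start; };

/* Entries must be sorted by start; a sentinel terminates the last range. */
static int cmp_range(const void *key, const void *pivot)
{
	unsigned long addr = (unsigned long)key;
	const struct range *ent = pivot;

	if (addr < ent[0].start)
		return -1;
	return addr >= ent[1].start;	/* 0: inside [ent[0].start, ent[1].start) */
}

int main(void)
{
	struct range table[] = { { 0x1000 }, { 0x2000 }, { 0x3000 }, { ~0UL } };
	/* Search the three real ranges; the sentinel is never a valid result. */
	struct range *hit = bsearch((void *)0x2a00, table, 3, sizeof(*table), cmp_range);

	printf("found range starting at %#lx\n", hit ? hit->start : 0UL);
	return 0;
}
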
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b6d42fe06115..8428c437cb9d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -68,14 +68,17 @@ enum trace_type {
#undef __field_fn
#define __field_fn(type, item) type item;
+#undef __field_packed
+#define __field_packed(type, item) type item;
+
#undef __field_struct
#define __field_struct(type, item) __field(type, item)
#undef __field_desc
#define __field_desc(type, container, item)
-#undef __field_packed
-#define __field_packed(type, container, item)
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item)
#undef __array
#define __array(type, item, size) type item[size];
@@ -2116,7 +2119,7 @@ extern void tracing_log_err(struct trace_array *tr,
* about performance). The internal_trace_puts() is for such
* a purpose.
*/
-#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str)
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index f6a8d29c0d76..54417468fdeb 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -79,8 +79,8 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
- __field_packed( unsigned long, graph_ent, func )
- __field_packed( unsigned long, graph_ent, depth )
+ __field_desc_packed(unsigned long, graph_ent, func )
+ __field_desc_packed(unsigned long, graph_ent, depth )
__dynamic_array(unsigned long, args )
),
@@ -96,9 +96,9 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
F_STRUCT(
__field_struct( struct fgraph_retaddr_ent, graph_rent )
- __field_packed( unsigned long, graph_rent.ent, func )
- __field_packed( unsigned long, graph_rent.ent, depth )
- __field_packed( unsigned long, graph_rent, retaddr )
+ __field_desc_packed( unsigned long, graph_rent.ent, func )
+ __field_desc_packed( unsigned long, graph_rent.ent, depth )
+ __field_desc_packed( unsigned long, graph_rent, retaddr )
__dynamic_array(unsigned long, args )
),
@@ -123,12 +123,12 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_packed( unsigned long, ret, func )
- __field_packed( unsigned long, ret, retval )
- __field_packed( unsigned int, ret, depth )
- __field_packed( unsigned int, ret, overrun )
- __field(unsigned long long, calltime )
- __field(unsigned long long, rettime )
+ __field_desc_packed( unsigned long, ret, func )
+ __field_desc_packed( unsigned long, ret, retval )
+ __field_desc_packed( unsigned int, ret, depth )
+ __field_desc_packed( unsigned int, ret, overrun )
+ __field_packed(unsigned long long, calltime)
+ __field_packed(unsigned long long, rettime )
),
F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx",
@@ -146,11 +146,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_packed( unsigned long, ret, func )
- __field_packed( unsigned int, ret, depth )
- __field_packed( unsigned int, ret, overrun )
- __field(unsigned long long, calltime )
- __field(unsigned long long, rettime )
+ __field_desc_packed( unsigned long, ret, func )
+ __field_desc_packed( unsigned int, ret, depth )
+ __field_desc_packed( unsigned int, ret, overrun )
+ __field_packed(unsigned long long, calltime )
+ __field_packed(unsigned long long, rettime )
),
F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u",
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5e6e70540eef..c97bb2fda5c0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2057,6 +2057,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
hist_field->fn_num = HIST_FIELD_FN_PSTRING;
+ } else if (field->filter_type == FILTER_STACKTRACE) {
+ flags |= HIST_FIELD_FL_STACKTRACE;
+
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
+ hist_field->fn_num = HIST_FIELD_FN_STACK;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 4554c458b78c..45c187e77e21 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -130,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call)
struct synth_event *event = call->data;
unsigned int i, size, n_u64;
char *name, *type;
+ int filter_type;
bool is_signed;
+ bool is_stack;
int ret = 0;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
@@ -138,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call)
is_signed = event->fields[i]->is_signed;
type = event->fields[i]->type;
name = event->fields[i]->name;
+ is_stack = event->fields[i]->is_stack;
+
+ filter_type = is_stack ? FILTER_STACKTRACE : FILTER_OTHER;
+
ret = trace_define_field(call, type, name, offset, size,
- is_signed, FILTER_OTHER);
+ is_signed, filter_type);
if (ret)
break;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 1698fc22afa0..32a42ef31855 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -42,11 +42,14 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field_fn
#define __field_fn(type, item) type item;
+#undef __field_packed
+#define __field_packed(type, item) type item;
+
#undef __field_desc
#define __field_desc(type, container, item) type item;
-#undef __field_packed
-#define __field_packed(type, container, item) type item;
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item) type item;
#undef __array
#define __array(type, item, size) type item[size];
@@ -104,11 +107,14 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __field_fn
#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN)
+#undef __field_packed
+#define __field_packed(_type, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+
#undef __field_desc
#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
-#undef __field_packed
-#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+#undef __field_desc_packed
+#define __field_desc_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
#undef __array
#define __array(_type, _item, _len) { \
@@ -146,11 +152,14 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __field_fn
#define __field_fn(type, item)
+#undef __field_packed
+#define __field_packed(type, item)
+
#undef __field_desc
#define __field_desc(type, container, item)
-#undef __field_packed
-#define __field_packed(type, container, item)
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item)
#undef __array
#define __array(type, item, len)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1e9c9913309..1de6f1573621 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -901,7 +901,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
trace_seq_printf(s, "%ps", func);
if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
- print_function_args(s, entry->args, (unsigned long)func);
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func);
trace_seq_putc(s, ';');
} else
trace_seq_puts(s, "();");
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 6ea2f6363b90..5c153106e642 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -125,7 +125,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
{
u64 time, delta;
- if (!likely(tsk->mm))
+ if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD)))
return;
time = stime + utime;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 586af49fc03e..fc4a8f2d3096 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -47,7 +47,7 @@ static int set_permissions(struct ctl_table_header *head,
int mode;
/* Allow users with CAP_SYS_RESOURCE unrestrained access */
- if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+ if (ns_capable_noaudit(user_ns, CAP_SYS_RESOURCE))
mode = (table->mode & S_IRWXU) >> 6;
else
/* Allow all others at most read-only access */
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 39e270789444..90ab3c1a205e 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state,
{
unsigned long cfa, fp, ra;
+ /* Get the Canonical Frame Address (CFA) */
if (frame->use_fp) {
if (state->fp < state->sp)
return -EINVAL;
@@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state,
} else {
cfa = state->sp;
}
-
- /* Get the Canonical Frame Address (CFA) */
cfa += frame->cfa_off;
- /* stack going in wrong direction? */
+	/* Make sure that the stack is not going in the wrong direction */
if (cfa <= state->sp)
return -EINVAL;
@@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state,
if (cfa & (state->ws - 1))
return -EINVAL;
- /* Find the Return Address (RA) */
+ /* Get the Return Address (RA) */
if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
return -EINVAL;
+ /* Get the Frame Pointer (FP) */
if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
return -EINVAL;
@@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state,
static int unwind_user_next_fp(struct unwind_user_state *state)
{
-#ifdef CONFIG_HAVE_UNWIND_USER_FP
struct pt_regs *regs = task_pt_regs(current);
if (state->topmost && unwind_user_at_function_start(regs)) {
@@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
ARCH_INIT_USER_FP_FRAME(state->ws)
};
return unwind_user_next_common(state, &fp_frame);
-#else
- return -EINVAL;
-#endif
}
static int unwind_user_next(struct unwind_user_state *state)
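
unwind_user_next_common() above derives the Canonical Frame Address from either the frame pointer or the stack pointer, checks that the stack only moves in one direction and stays word aligned, and then reads the return address and the saved frame pointer at architecture-defined offsets from the CFA. A condensed sketch of one frame-pointer step follows; struct uw_state, fp_unwind_step() and the read_word callback are illustrative stand-ins (the real offsets come from ARCH_INIT_USER_FP_FRAME and the reads go through get_user_word()).

/* Illustrative single FP-unwind step; read_word() stands in for get_user_word(). */
struct uw_state { unsigned long sp, fp, ip, ws; };

static int fp_unwind_step(struct uw_state *st,
			  long cfa_off, long ra_off, long fp_off,
			  int (*read_word)(unsigned long *val, unsigned long addr))
{
	unsigned long cfa, ra, fp = 0;

	if (st->fp < st->sp)			/* FP must not point below SP */
		return -1;
	cfa = st->fp + cfa_off;			/* Canonical Frame Address */

	if (cfa <= st->sp)			/* stack must keep moving one way */
		return -1;
	if (cfa & (st->ws - 1))			/* CFA must be word aligned */
		return -1;

	if (read_word(&ra, cfa + ra_off))	/* Return Address */
		return -1;
	if (fp_off && read_word(&fp, cfa + fp_off))	/* saved Frame Pointer */
		return -1;

	st->ip = ra;
	st->sp = cfa;
	if (fp_off)
		st->fp = fp;
	return 0;
}
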
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index fe9bf8db1922..8d82913223a1 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -36,7 +36,11 @@ struct hwerr_info {
time64_t timestamp;
};
-static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+/*
+ * The hwerr_data[] array is declared with global scope so that it remains
+ * accessible to vmcoreinfo even when Link Time Optimization (LTO) is enabled.
+ */
+struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
@@ -137,7 +141,9 @@ EXPORT_SYMBOL_GPL(hwerr_log_error_type);
static int __init crash_save_vmcoreinfo_init(void)
{
- vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ int order;
+ order = get_order(VMCOREINFO_BYTES);
+ vmcoreinfo_data = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
if (!vmcoreinfo_data) {
pr_warn("Memory allocation for vmcoreinfo_data failed\n");
return -ENOMEM;
@@ -146,7 +152,7 @@ static int __init crash_save_vmcoreinfo_init(void)
vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
GFP_KERNEL | __GFP_ZERO);
if (!vmcoreinfo_note) {
- free_page((unsigned long)vmcoreinfo_data);
+ free_pages((unsigned long)vmcoreinfo_data, order);
vmcoreinfo_data = NULL;
pr_warn("Memory allocation for vmcoreinfo_note failed\n");
return -ENOMEM;
@@ -238,7 +244,6 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(kallsyms_token_table);
VMCOREINFO_SYMBOL(kallsyms_token_index);
VMCOREINFO_SYMBOL(kallsyms_offsets);
- VMCOREINFO_SYMBOL(kallsyms_relative_base);
#endif /* CONFIG_KALLSYMS */
arch_crash_save_vmcoreinfo();
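
The vmcoreinfo change above switches from a single zeroed page to an order-based allocation so the buffer can cover VMCOREINFO_BYTES even when that exceeds one page; get_order() rounds a byte count up to the next power-of-two number of pages, and the matching free_pages() call must be passed the same order, which is why the error path now tracks it. A minimal sketch of that pairing, with foo_bytes and alloc_foo_buffer() as illustrative names:

/* Sketch of an order-based allocation paired with its matching free. */
static unsigned char *alloc_foo_buffer(size_t foo_bytes, int *out_order)
{
	int order = get_order(foo_bytes);	/* rounds up to a power-of-two page count */
	unsigned char *buf;

	buf = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
	if (!buf)
		return NULL;

	*out_order = order;			/* caller must free_pages(buf, order) */
	return buf;
}
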
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 366122f4a0f8..7d675781bc91 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -550,7 +550,7 @@ static bool need_counting_irqs(void)
u8 util;
int tail = __this_cpu_read(cpustat_tail);
- tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
+ tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
return util > HARDIRQ_PERCENT_THRESH;
}
@@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
- int duration;
int softlockup_all_cpu_backtrace;
+ int duration, thresh_count;
unsigned long flags;
if (!watchdog_enabled)
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ thresh_count = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && thresh_count >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
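
With the sysctl limit raised from SYSCTL_ONE to SYSCTL_INT_MAX, softlockup_panic now acts as a count threshold rather than a boolean. Roughly, assuming get_softlockup_thresh() evaluates to 20 seconds: with softlockup_panic set to 3, a CPU stuck for 45 seconds gives thresh_count = 45 / 20 = 2 and only logs the lockup, while one stuck for 60 seconds gives thresh_count = 3 and panics; setting softlockup_panic to 1 preserves the old panic-on-first-report behaviour.
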
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..cf05775a96d3 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
watchdog_hardlockup_check(smp_processor_id(), regs);
}
-static int hardlockup_detector_event_create(void)
+static struct perf_event *hardlockup_detector_event_create(unsigned int cpu)
{
- unsigned int cpu;
struct perf_event_attr *wd_attr;
struct perf_event *evt;
- /*
- * Preemption is not disabled because memory will be allocated.
- * Ensure CPU-locality by calling this in per-CPU kthread.
- */
- WARN_ON(!is_percpu_thread());
- cpu = raw_smp_processor_id();
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
@@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void)
watchdog_overflow_callback, NULL);
}
- if (IS_ERR(evt)) {
- pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
- PTR_ERR(evt));
- return PTR_ERR(evt);
- }
- WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
- this_cpu_write(watchdog_ev, evt);
- return 0;
+ return evt;
}
/**
@@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void)
*/
void watchdog_hardlockup_enable(unsigned int cpu)
{
+ struct perf_event *evt;
+
WARN_ON_ONCE(cpu != smp_processor_id());
- if (hardlockup_detector_event_create())
+ evt = hardlockup_detector_event_create(cpu);
+ if (IS_ERR(evt)) {
+ pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
return;
+ }
/* use original value for check */
if (!atomic_fetch_inc(&watchdog_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
+ WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
+ this_cpu_write(watchdog_ev, evt);
+
watchdog_init_timestamp();
- perf_event_enable(this_cpu_read(watchdog_ev));
+ perf_event_enable(evt);
}
/**
@@ -263,19 +258,30 @@ bool __weak __init arch_perf_nmi_is_available(void)
*/
int __init watchdog_hardlockup_probe(void)
{
+ struct perf_event *evt;
+ unsigned int cpu;
int ret;
if (!arch_perf_nmi_is_available())
return -ENODEV;
- ret = hardlockup_detector_event_create();
+ if (!hw_nmi_get_sample_period(watchdog_thresh))
+ return -EINVAL;
- if (ret) {
+ /*
+ * Test hardware PMU availability by creating a temporary perf event.
+ * The event is released immediately.
+ */
+ cpu = raw_smp_processor_id();
+ evt = hardlockup_detector_event_create(cpu);
+ if (IS_ERR(evt)) {
pr_info("Perf NMI watchdog permanently disabled\n");
+ ret = PTR_ERR(evt);
} else {
- perf_event_release_kernel(this_cpu_read(watchdog_ev));
- this_cpu_write(watchdog_ev, NULL);
+ perf_event_release_kernel(evt);
+ ret = 0;
}
+
return ret;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..c515cff01828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -117,6 +117,8 @@ enum wq_internal_consts {
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
+ RESCUER_BATCH = 16, /* process items per turn */
+
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
u64 stats[PWQ_NR_STATS];
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
return NULL;
}
+static void mayday_cursor_func(struct work_struct *work)
+{
+ /* should not be processed, only for marking position */
+ BUG();
+}
+
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
lockdep_assert_held(&pool->lock);
+ /* The cursor work should not be processed */
+ if (unlikely(work->func == mayday_cursor_func)) {
+ /* only worker_thread() can possibly take this branch */
+ WARN_ON_ONCE(worker->rescue_wq);
+ if (nextp)
+ *nextp = list_next_entry(work, entry);
+ list_del_init(&work->entry);
+ return false;
+ }
+
/*
* A single work shouldn't be executed concurrently by multiple workers.
* __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
reap_dying_workers(&cull_list);
}
-static void send_mayday(struct work_struct *work)
+static void send_mayday(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq_mayday_lock);
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* rescuers.
*/
list_for_each_entry(work, &pool->worklist, entry)
- send_mayday(work);
+ send_mayday(get_work_pwq(work));
}
raw_spin_unlock(&wq_mayday_lock);
@@ -3440,22 +3458,57 @@ sleep:
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
{
struct worker_pool *pool = pwq->pool;
+ struct work_struct *cursor = &pwq->mayday_cursor;
struct work_struct *work, *n;
- /* need rescue? */
- if (!pwq->nr_active || !need_to_create_worker(pool))
+ /* have work items to rescue? */
+ if (!pwq->nr_active)
return false;
- /*
- * Slurp in all works issued via this workqueue and
- * process'em.
- */
- list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+ /* need rescue? */
+ if (!need_to_create_worker(pool)) {
+ /*
+ * The pool has idle workers and doesn't need the rescuer, so it
+ * could simply return false here.
+ *
+ * However, the memory pressure might not be fully relieved.
+ * In PERCPU pool with concurrency enabled, having idle workers
+	 * In a PERCPU pool with concurrency enabled, having idle workers
+ * simply mean regular workers have woken up, completed their
+ * work, and gone idle again due to concurrency limits.
+ *
+ * In this case, those working workers may later sleep again,
+ * the pool may run out of idle workers, and it will have to
+ * allocate new ones and wait for the timer to send mayday,
+	 * causing unnecessary delay, especially if the memory pressure
+	 * was never relieved in the meantime.
+	 *
+	 * To reduce such relapses, keep rescuing while memory pressure
+	 * still appears to be on, using (pool->flags & POOL_MANAGER_ACTIVE)
+	 * as a rough indicator, unless other PWQs are also asking for help.
+ */
+ if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
+ !list_empty(&pwq->wq->maydays))
+ return false;
+ }
+
+ /* search from the start or cursor if available */
+ if (list_empty(&cursor->entry))
+ work = list_first_entry(&pool->worklist, struct work_struct, entry);
+ else
+ work = list_next_entry(cursor, entry);
+
+ /* find the next work item to rescue */
+ list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
pwq->stats[PWQ_STAT_RESCUED]++;
+ /* put the cursor for next search */
+ list_move_tail(&cursor->entry, &n->entry);
+ return true;
+ }
}
- return !list_empty(&rescuer->scheduled);
+ return false;
}
/**
@@ -3512,6 +3565,7 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
+ unsigned int count = 0;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3524,31 +3578,27 @@ repeat:
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
- if (assign_rescuer_work(pwq, rescuer)) {
+ while (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
- * The above execution of rescued work items could
- * have created more to rescue through
- * pwq_activate_first_inactive() or chained
- * queueing. Let's put @pwq back on mayday list so
- * that such back-to-back work items, which may be
- * being used to relieve memory pressure, don't
- * incur MAYDAY_INTERVAL delay inbetween.
+ * If the per-turn work item limit is reached and other
+ * PWQs are in mayday, requeue mayday for this PWQ and
+ * let the rescuer handle the other PWQs first.
*/
- if (pwq->nr_active && need_to_create_worker(pool)) {
+ if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
+ pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
- /*
- * Queue iff somebody else hasn't queued it already.
- */
- if (list_empty(&pwq->mayday_node)) {
- get_pwq(pwq);
- list_add_tail(&pwq->mayday_node, &wq->maydays);
- }
+ send_mayday(pwq);
raw_spin_unlock(&wq_mayday_lock);
+ break;
}
}
+	/* The cursor must not be left behind once the rescuer stops watching this PWQ. */
+ if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
+ list_del_init(&pwq->mayday_cursor.entry);
+
/*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
+
+ /*
+	 * Set up the dummy cursor work with a valid function and a valid get_work_pwq().
+ *
+ * The cursor work should only be in the pwq->pool->worklist, and
+ * should not be treated as a processable work item.
+ *
+	 * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
+	 * surprising to kernel debugging tools and reviewers.
+ */
+ INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
+ atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -6959,13 +7022,16 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
- * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update
+ * @hk: the new housekeeping cpumask
+ *
+ * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such
+ * that the effective unbound affinity is the intersection of the new housekeeping
+ * with the requested affinity set via nohz_full=/isolcpus= or sysfs.
*
- * This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask.
+ * Return: 0 on success and -errno on failure.
*/
-int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+int workqueue_unbound_housekeeping_update(const struct cpumask *hk)
{
cpumask_var_t cpumask;
int ret = 0;
@@ -6981,14 +7047,14 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
* (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
* by any subsequent write to workqueue/cpumask sysfs file.
*/
- if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk))
cpumask_copy(cpumask, wq_requested_unbound_cpumask);
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
/* Save the current isolated cpumask & export it via sysfs */
if (!ret)
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk);
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
@@ -7505,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
-static unsigned int wq_panic_on_stall;
+static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
+static unsigned int wq_panic_on_stall_time;
+module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
+MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
+
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7559,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
-static void panic_on_wq_watchdog(void)
+/*
+ * Trigger a panic in two scenarios: when the total number of stalls
+ * reaches wq_panic_on_stall, or when a single stall lasts at least
+ * wq_panic_on_stall_time seconds.
+ */
+static void panic_on_wq_watchdog(unsigned int stall_time_sec)
{
static unsigned int wq_stall;
if (wq_panic_on_stall) {
wq_stall++;
- BUG_ON(wq_stall >= wq_panic_on_stall);
+ if (wq_stall >= wq_panic_on_stall)
+ panic("workqueue: %u stall(s) exceeded threshold %u\n",
+ wq_stall, wq_panic_on_stall);
}
+
+ if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
+ panic("workqueue: stall lasted %us, exceeding threshold %us\n",
+ stall_time_sec, wq_panic_on_stall_time);
}
static void wq_watchdog_reset_touched(void)
@@ -7581,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned int max_stall_time = 0;
bool lockup_detected = false;
bool cpu_pool_stall = false;
unsigned long now = jiffies;
struct worker_pool *pool;
+ unsigned int stall_time;
int pi;
if (!thresh)
@@ -7618,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
+ stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
+ max_stall_time = max(max_stall_time, stall_time);
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
pool->cpu_stall = true;
cpu_pool_stall = true;
}
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
- pr_cont(" stuck for %us!\n",
- jiffies_to_msecs(now - pool_ts) / 1000);
+ pr_cont(" stuck for %us!\n", stall_time);
}
@@ -7638,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_cpu_pools_hogs();
if (lockup_detected)
- panic_on_wq_watchdog();
+ panic_on_wq_watchdog(max_stall_time);
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
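
The mayday-cursor change above is an instance of a general pattern: park a dummy list node where a batched scan stopped so the next turn can resume there, while every consumer that walks the list knows to skip the dummy (here via the mayday_cursor_func check in assign_work()). A stripped-down, self-contained sketch of the pattern over a plain list_head follows; struct item, struct scanner and scan_batch() are illustrative names, not the workqueue implementation.

#include <linux/list.h>

struct item { struct list_head entry; int payload; };

struct scanner {
	struct list_head cursor;	/* dummy node; INIT_LIST_HEAD() it before first use */
};

/* Process up to @budget items from @head, remembering the position in @sc->cursor. */
static int scan_batch(struct list_head *head, struct scanner *sc, int budget)
{
	struct item *it, *n;
	int done = 0;

	if (list_empty(&sc->cursor))
		it = list_first_entry(head, struct item, entry);
	else
		it = list_entry(sc->cursor.next, struct item, entry);

	list_for_each_entry_safe_from(it, n, head, entry) {
		if (&it->entry == &sc->cursor)	/* defensive: never treat the cursor as data */
			continue;

		/* ... process it->payload ... */

		if (++done >= budget) {
			/* park the cursor right after the last processed item */
			list_move(&sc->cursor, &it->entry);
			return done;
		}
	}

	list_del_init(&sc->cursor);	/* full pass finished: take the cursor out */
	return done;
}
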